1 files changed, 7383 insertions, 0 deletions
diff --git a/packages/glibc/2.17/0052-glibc-ppc64le-30.patch b/packages/glibc/2.17/0052-glibc-ppc64le-30.patch
new file mode 100644
index 0000000..3834dcc
--- /dev/null
+++ b/packages/glibc/2.17/0052-glibc-ppc64le-30.patch
@@ -0,0 +1,7383 @@
+# commit fe6e95d7171eba5f3e07848f081676fae4e86322
+# Author: Alan Modra <amodra@gmail.com>
+# Date:   Sat Aug 17 18:46:47 2013 +0930
+# 
+#     PowerPC LE memcmp
+#     http://sourceware.org/ml/libc-alpha/2013-08/msg00102.html
+#     
+#     This is a rather large patch due to formatting and renaming.  The
+#     formatting changes were to make it possible to compare power7 and
+#     power4 versions of memcmp.  Using different register defines came
+#     about while I was wrestling with the code, trying to find spare
+#     registers at one stage.  I found it much simpler if we refer to a reg
+#     by the same name throughout a function, so it's better if short-term
+#     multiple use regs like rTMP are referred to using their register
+#     number.  I made the cr field usage changes when attempting to reload
+#     rWORDn regs in the exit path to byte swap before comparing when
+#     little-endian.  That proved a bad idea due to the pipelining involved
+#     in the main loop; offsets to reload the regs were different first
+#     time around the loop.  Anyway, I left the cr field usage changes in
+#     place for consistency.
+#     
+#     Aside from these more-or-less cosmetic changes, I fixed a number of
+#     places where an early exit path restores regs unnecessarily, removed
+#     some dead code, and optimised one or two exits.
+#     
+#         * sysdeps/powerpc/powerpc64/power7/memcmp.S: Add little-endian support.
+#         Formatting.  Consistently use rXXX register defines or rN defines.
+#         Use early exit labels that avoid restoring unused non-volatile regs.
+#         Make cr field use more consistent with rWORDn compares.  Rename
+#         regs used as shift registers for unaligned loop, using rN defines
+#         for short lifetime/multiple use regs.
+#         * sysdeps/powerpc/powerpc64/power4/memcmp.S: Likewise.
+#         * sysdeps/powerpc/powerpc32/power7/memcmp.S: Likewise.  Exit with
+#         addi 1,1,64 to pop stack frame.  Simplify return value code.
+#         * sysdeps/powerpc/powerpc32/power4/memcmp.S: Likewise.
+# 
+diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcmp.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcmp.S
+--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcmp.S	2014-05-28 19:22:37.000000000 -0500
++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power4/memcmp.S	2014-05-28 23:55:52.000000000 -0500
+@@ -1,4 +1,4 @@
+-/* Optimized strcmp implementation for PowerPC64.
++/* Optimized memcmp implementation for PowerPC32.
+    Copyright (C) 2003, 2006 Free Software Foundation, Inc.
+    This file is part of the GNU C Library.
+ 
+@@ -20,13 +20,14 @@
+ #include <bp-sym.h>
+ #include <bp-asm.h>
+ 
+-/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5])  */
+-
++/* int [r3] memcmp (const char *s1 [r3],
++		    const char *s2 [r4],
++		    size_t size [r5])  */
++ 
+ 	.machine power4
+ EALIGN (BP_SYM(memcmp), 4, 0)
+ 	CALL_MCOUNT
+ 
+-#define rTMP	r0
+ #define rRTN	r3
+ #define rSTR1	r3	/* first string arg */
+ #define rSTR2	r4	/* second string arg */
+@@ -37,33 +38,32 @@
+ #define rWORD4	r9	/* next word in s2 */
+ #define rWORD5	r10	/* next word in s1 */
+ #define rWORD6	r11	/* next word in s2 */
+-#define rBITDIF	r12	/* bits that differ in s1 & s2 words */
+ #define rWORD7	r30	/* next word in s1 */
+ #define rWORD8	r31	/* next word in s2 */
+ 
+-	xor	rTMP, rSTR2, rSTR1
++	xor	r0, rSTR2, rSTR1
+ 	cmplwi	cr6, rN, 0
+ 	cmplwi	cr1, rN, 12
+-	clrlwi.	rTMP, rTMP, 30
+-	clrlwi	rBITDIF, rSTR1, 30
+-	cmplwi	cr5, rBITDIF, 0
++	clrlwi.	r0, r0, 30
++	clrlwi	r12, rSTR1, 30
++	cmplwi	cr5, r12, 0
+ 	beq-	cr6, L(zeroLength)
+-	dcbt	0,rSTR1
+-	dcbt	0,rSTR2
++	dcbt	0, rSTR1
++	dcbt	0, rSTR2
+ /* If less than 8 bytes or not aligned, use the unaligned
+    byte loop.  */
+ 	blt	cr1, L(bytealigned)
+-        stwu    1,-64(1)
++	stwu	1, -64(r1)
+ 	cfi_adjust_cfa_offset(64)
+-        stw     r31,48(1)	
+-	cfi_offset(31,(48-64))
+-        stw     r30,44(1)	
+-	cfi_offset(30,(44-64))
++	stw	rWORD8, 48(r1)
++	cfi_offset(rWORD8, (48-64))
++	stw	rWORD7, 44(r1)
++	cfi_offset(rWORD7, (44-64))
+ 	bne	L(unaligned)
+ /* At this point we know both strings have the same alignment and the
+-   compare length is at least 8 bytes.  rBITDIF contains the low order
++   compare length is at least 8 bytes.  r12 contains the low order
+    2 bits of rSTR1 and cr5 contains the result of the logical compare
+-   of rBITDIF to 0.  If rBITDIF == 0 then we are already word 
++   of r12 to 0.  If r12 == 0 then we are already word
+    aligned and can perform the word aligned loop.
+   
+    Otherwise we know the two strings have the same alignment (but not
+@@ -72,74 +72,95 @@
+    eliminate bits preceeding the first byte.  Since we want to join the
+    normal (word aligned) compare loop, starting at the second word,
+    we need to adjust the length (rN) and special case the loop
+-   versioning for the first word. This insures that the loop count is
++   versioning for the first word. This ensures that the loop count is
+    correct and the first word (shifted) is in the expected register pair. */
+-	.align 4
++	.align	4
+ L(samealignment):
+ 	clrrwi	rSTR1, rSTR1, 2
+ 	clrrwi	rSTR2, rSTR2, 2
+ 	beq	cr5, L(Waligned)
+-	add	rN, rN, rBITDIF
+-	slwi	r11, rBITDIF, 3
+-	srwi	rTMP, rN, 4	 /* Divide by 16 */
+-	andi.	rBITDIF, rN, 12  /* Get the word remainder */
++	add	rN, rN, r12
++	slwi	rWORD6, r12, 3
++	srwi	r0, rN, 4	/* Divide by 16 */
++	andi.	r12, rN, 12	/* Get the word remainder */
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD1, 0(rSTR1)
+ 	lwz	rWORD2, 0(rSTR2)
+-	cmplwi	cr1, rBITDIF, 8
++#endif
++	cmplwi	cr1, r12, 8
+ 	cmplwi	cr7, rN, 16
+ 	clrlwi	rN, rN, 30
+ 	beq	L(dPs4)
+-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
++	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+ 	bgt	cr1, L(dPs3)
+ 	beq	cr1, L(dPs2)
+ 
+ /* Remainder is 4 */
+-	.align 3
++	.align	3
+ L(dsP1):
+-	slw	rWORD5, rWORD1, r11
+-	slw	rWORD6, rWORD2, r11
++	slw	rWORD5, rWORD1, rWORD6
++	slw	rWORD6, rWORD2, rWORD6
+ 	cmplw	cr5, rWORD5, rWORD6
+ 	blt	cr7, L(dP1x)
+ /* Do something useful in this cycle since we have to branch anyway.  */
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD1, 4(rSTR1)
+ 	lwz	rWORD2, 4(rSTR2)
+-	cmplw	cr0, rWORD1, rWORD2
++#endif
++	cmplw	cr7, rWORD1, rWORD2
+ 	b	L(dP1e)
+ /* Remainder is 8 */
+-	.align 4
++	.align	4
+ L(dPs2):
+-	slw	rWORD5, rWORD1, r11
+-	slw	rWORD6, rWORD2, r11
++	slw	rWORD5, rWORD1, rWORD6
++	slw	rWORD6, rWORD2, rWORD6
+ 	cmplw	cr6, rWORD5, rWORD6
+ 	blt	cr7, L(dP2x)
+ /* Do something useful in this cycle since we have to branch anyway.  */
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD7, 0, rSTR1
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD7, 4(rSTR1)
+ 	lwz	rWORD8, 4(rSTR2)
++#endif
+ 	cmplw	cr5, rWORD7, rWORD8
+ 	b	L(dP2e)
+ /* Remainder is 12 */
+-	.align 4
++	.align	4
+ L(dPs3):
+-	slw	rWORD3, rWORD1, r11
+-	slw	rWORD4, rWORD2, r11
++	slw	rWORD3, rWORD1, rWORD6
++	slw	rWORD4, rWORD2, rWORD6
+ 	cmplw	cr1, rWORD3, rWORD4
+ 	b	L(dP3e)
+ /* Count is a multiple of 16, remainder is 0 */
+-	.align 4
++	.align	4
+ L(dPs4):
+-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+-	slw	rWORD1, rWORD1, r11
+-	slw	rWORD2, rWORD2, r11
+-	cmplw	cr0, rWORD1, rWORD2
++	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
++	slw	rWORD1, rWORD1, rWORD6
++	slw	rWORD2, rWORD2, rWORD6
++	cmplw	cr7, rWORD1, rWORD2
+ 	b	L(dP4e)
+ 
+ /* At this point we know both strings are word aligned and the
+    compare length is at least 8 bytes.  */
+-	.align 4
++	.align	4
+ L(Waligned):
+-	andi.	rBITDIF, rN, 12  /* Get the word remainder */
+-	srwi	rTMP, rN, 4	 /* Divide by 16 */
+-	cmplwi	cr1, rBITDIF, 8
++	andi.	r12, rN, 12	/* Get the word remainder */
++	srwi	r0, rN, 4	/* Divide by 16 */
++	cmplwi	cr1, r12, 8
+ 	cmplwi	cr7, rN, 16
+ 	clrlwi	rN, rN, 30
+ 	beq	L(dP4)
+@@ -147,177 +168,352 @@
+ 	beq	cr1, L(dP2)
+ 		
+ /* Remainder is 4 */
+-	.align 4
++	.align	4
+ L(dP1):
+-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
++	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+ /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
+    (8-15 byte compare), we want to use only volatile registers.  This
+    means we can avoid restoring non-volatile registers since we did not
+    change any on the early exit path.  The key here is the non-early
+    exit path only cares about the condition code (cr5), not about which 
+    register pair was used.  */
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD5, 0, rSTR1
++	lwbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD5, 0(rSTR1)
+ 	lwz	rWORD6, 0(rSTR2)
++#endif
+ 	cmplw	cr5, rWORD5, rWORD6
+ 	blt	cr7, L(dP1x)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD1, 4(rSTR1)
+ 	lwz	rWORD2, 4(rSTR2)
+-	cmplw	cr0, rWORD1, rWORD2
++#endif
++	cmplw	cr7, rWORD1, rWORD2
+ L(dP1e):
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD3, 0, rSTR1
++	lwbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD3, 8(rSTR1)
+ 	lwz	rWORD4, 8(rSTR2)
++#endif
+ 	cmplw	cr1, rWORD3, rWORD4
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD5, 0, rSTR1
++	lwbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD5, 12(rSTR1)
+ 	lwz	rWORD6, 12(rSTR2)
++#endif
+ 	cmplw	cr6, rWORD5, rWORD6
+-	bne	cr5, L(dLcr5)
+-	bne	cr0, L(dLcr0)
+-	
++	bne	cr5, L(dLcr5x)
++	bne	cr7, L(dLcr7x)
++
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD7, 0, rSTR1
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwzu	rWORD7, 16(rSTR1)
+ 	lwzu	rWORD8, 16(rSTR2)
++#endif
+ 	bne	cr1, L(dLcr1)
+ 	cmplw	cr5, rWORD7, rWORD8
+ 	bdnz	L(dLoop)
+ 	bne	cr6, L(dLcr6)
+-        lwz     r30,44(1)
+-        lwz     r31,48(1)
+-	.align 3
++	lwz	rWORD7, 44(r1)
++	lwz	rWORD8, 48(r1)
++	.align	3
+ L(dP1x):
+ 	slwi.	r12, rN, 3
+-	bne	cr5, L(dLcr5)
++	bne	cr5, L(dLcr5x)
+ 	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
+-        lwz     1,0(1)
++	addi	1, 1, 64
++	cfi_adjust_cfa_offset(-64)
+ 	bne	L(d00)
+ 	li	rRTN, 0
+ 	blr
+ 		
+ /* Remainder is 8 */
+-	.align 4
++	.align	4
++	cfi_adjust_cfa_offset(64)
+ L(dP2):
+-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
++	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD5, 0, rSTR1
++	lwbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD5, 0(rSTR1)
+ 	lwz	rWORD6, 0(rSTR2)
++#endif
+ 	cmplw	cr6, rWORD5, rWORD6
+ 	blt	cr7, L(dP2x)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD7, 0, rSTR1
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD7, 4(rSTR1)
+ 	lwz	rWORD8, 4(rSTR2)
++#endif
+ 	cmplw	cr5, rWORD7, rWORD8
+ L(dP2e):
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD1, 8(rSTR1)
+ 	lwz	rWORD2, 8(rSTR2)
+-	cmplw	cr0, rWORD1, rWORD2
++#endif
++	cmplw	cr7, rWORD1, rWORD2
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD3, 0, rSTR1
++	lwbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD3, 12(rSTR1)
+ 	lwz	rWORD4, 12(rSTR2)
++#endif
+ 	cmplw	cr1, rWORD3, rWORD4
++#ifndef __LITTLE_ENDIAN__
+ 	addi	rSTR1, rSTR1, 4
+ 	addi	rSTR2, rSTR2, 4
++#endif
+ 	bne	cr6, L(dLcr6)
+ 	bne	cr5, L(dLcr5)
+ 	b	L(dLoop2)
+ /* Again we are on a early exit path (16-23 byte compare), we want to
+    only use volatile registers and avoid restoring non-volatile
+    registers.  */
+-	.align 4
++	.align	4
+ L(dP2x):
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD3, 0, rSTR1
++	lwbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD3, 4(rSTR1)
+ 	lwz	rWORD4, 4(rSTR2)
+-	cmplw	cr5, rWORD3, rWORD4
++#endif
++	cmplw	cr1, rWORD3, rWORD4
+ 	slwi.	r12, rN, 3
+-	bne	cr6, L(dLcr6)
++	bne	cr6, L(dLcr6x)
++#ifndef __LITTLE_ENDIAN__
+ 	addi	rSTR1, rSTR1, 4
+ 	addi	rSTR2, rSTR2, 4
+-	bne	cr5, L(dLcr5)
++#endif
++	bne	cr1, L(dLcr1x)
+ 	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
+-        lwz     1,0(1)
++	addi	1, 1, 64
++	cfi_adjust_cfa_offset(-64)
+ 	bne	L(d00)
+ 	li	rRTN, 0
+ 	blr
+ 		
+ /* Remainder is 12 */
+-	.align 4
++	.align	4
++	cfi_adjust_cfa_offset(64)
+ L(dP3):
+-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
++	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD3, 0, rSTR1
++	lwbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD3, 0(rSTR1)
+ 	lwz	rWORD4, 0(rSTR2)
++#endif
+ 	cmplw	cr1, rWORD3, rWORD4
+ L(dP3e):
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD5, 0, rSTR1
++	lwbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD5, 4(rSTR1)
+ 	lwz	rWORD6, 4(rSTR2)
++#endif
+ 	cmplw	cr6, rWORD5, rWORD6
+ 	blt	cr7, L(dP3x)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD7, 0, rSTR1
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD7, 8(rSTR1)
+ 	lwz	rWORD8, 8(rSTR2)
++#endif
+ 	cmplw	cr5, rWORD7, rWORD8
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD1, 12(rSTR1)
+ 	lwz	rWORD2, 12(rSTR2)
+-	cmplw	cr0, rWORD1, rWORD2
++#endif
++	cmplw	cr7, rWORD1, rWORD2
++#ifndef __LITTLE_ENDIAN__
+ 	addi	rSTR1, rSTR1, 8
+ 	addi	rSTR2, rSTR2, 8
++#endif
+ 	bne	cr1, L(dLcr1)
+ 	bne	cr6, L(dLcr6)
+ 	b	L(dLoop1)
+ /* Again we are on a early exit path (24-31 byte compare), we want to
+    only use volatile registers and avoid restoring non-volatile
+    registers.  */
+-	.align 4
++	.align	4
+ L(dP3x):
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD1, 8(rSTR1)
+ 	lwz	rWORD2, 8(rSTR2)
+-	cmplw	cr5, rWORD1, rWORD2
++#endif
++	cmplw	cr7, rWORD1, rWORD2
+ 	slwi.	r12, rN, 3
+-	bne	cr1, L(dLcr1)
++	bne	cr1, L(dLcr1x)
++#ifndef __LITTLE_ENDIAN__
+ 	addi	rSTR1, rSTR1, 8
+ 	addi	rSTR2, rSTR2, 8
+-	bne	cr6, L(dLcr6)
++#endif
++	bne	cr6, L(dLcr6x)
+ 	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
+-	bne	cr5, L(dLcr5)
+-        lwz     1,0(1)
++	bne	cr7, L(dLcr7x)
++	addi	1, 1, 64
++	cfi_adjust_cfa_offset(-64)
+ 	bne	L(d00)
+ 	li	rRTN, 0
+ 	blr
+ 	
+ /* Count is a multiple of 16, remainder is 0 */
+-	.align 4
++	.align	4
++	cfi_adjust_cfa_offset(64)
+ L(dP4):
+-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
++	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD1, 0(rSTR1)
+ 	lwz	rWORD2, 0(rSTR2)
+-	cmplw	cr0, rWORD1, rWORD2
++#endif
++	cmplw	cr7, rWORD1, rWORD2
+ L(dP4e):
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD3, 0, rSTR1
++	lwbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD3, 4(rSTR1)
+ 	lwz	rWORD4, 4(rSTR2)
++#endif
+ 	cmplw	cr1, rWORD3, rWORD4
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD5, 0, rSTR1
++	lwbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD5, 8(rSTR1)
+ 	lwz	rWORD6, 8(rSTR2)
++#endif
+ 	cmplw	cr6, rWORD5, rWORD6
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD7, 0, rSTR1
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwzu	rWORD7, 12(rSTR1)
+ 	lwzu	rWORD8, 12(rSTR2)
++#endif
+ 	cmplw	cr5, rWORD7, rWORD8
+-	bne	cr0, L(dLcr0)
++	bne	cr7, L(dLcr7)
+ 	bne	cr1, L(dLcr1)
+ 	bdz-	L(d24)		/* Adjust CTR as we start with +4 */
+ /* This is the primary loop */
+-	.align 4
++	.align	4
+ L(dLoop):
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD1, 4(rSTR1)
+ 	lwz	rWORD2, 4(rSTR2)
++#endif
+ 	cmplw	cr1, rWORD3, rWORD4
+ 	bne	cr6, L(dLcr6)
+ L(dLoop1):
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD3, 0, rSTR1
++	lwbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD3, 8(rSTR1)
+ 	lwz	rWORD4, 8(rSTR2)
++#endif
+ 	cmplw	cr6, rWORD5, rWORD6
+ 	bne	cr5, L(dLcr5)
+ L(dLoop2):
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD5, 0, rSTR1
++	lwbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD5, 12(rSTR1)
+ 	lwz	rWORD6, 12(rSTR2)
++#endif
+ 	cmplw	cr5, rWORD7, rWORD8
+-	bne	cr0, L(dLcr0)
++	bne	cr7, L(dLcr7)
+ L(dLoop3):
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD7, 0, rSTR1
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwzu	rWORD7, 16(rSTR1)
+ 	lwzu	rWORD8, 16(rSTR2)
++#endif
+ 	bne-	cr1, L(dLcr1)
+-	cmplw	cr0, rWORD1, rWORD2
++	cmplw	cr7, rWORD1, rWORD2
+ 	bdnz+	L(dLoop)	
+ 	
+ L(dL4):
+@@ -327,7 +523,7 @@
+ 	bne	cr5, L(dLcr5)
+ 	cmplw	cr5, rWORD7, rWORD8
+ L(d44):
+-	bne	cr0, L(dLcr0)
++	bne	cr7, L(dLcr7)
+ L(d34):
+ 	bne	cr1, L(dLcr1)
+ L(d24):
+@@ -336,69 +532,82 @@
+ 	slwi.	r12, rN, 3
+ 	bne	cr5, L(dLcr5) 
+ L(d04):
+-        lwz     r30,44(1)
+-        lwz     r31,48(1)
+-        lwz     1,0(1)
++	lwz	rWORD7, 44(r1)
++	lwz	rWORD8, 48(r1)
++	addi	1, 1, 64
++	cfi_adjust_cfa_offset(-64)
+ 	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
+ 	beq	L(zeroLength)
+ /* At this point we have a remainder of 1 to 3 bytes to compare.  Since
+    we are aligned it is safe to load the whole word, and use
+-   shift right to eliminate bits beyond the compare length. */ 
++   shift right to eliminate bits beyond the compare length.  */
+ L(d00):
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD1, 4(rSTR1)
+ 	lwz	rWORD2, 4(rSTR2) 
++#endif
+ 	srw	rWORD1, rWORD1, rN
+ 	srw	rWORD2, rWORD2, rN
+-        cmplw   rWORD1,rWORD2
+-        li      rRTN,0
+-        beqlr
+-        li      rRTN,1
+-        bgtlr
+-        li      rRTN,-1
+-        blr
+-
+-	.align 4
+-L(dLcr0):
+-        lwz     r30,44(1)
+-        lwz     r31,48(1)
++	sub	rRTN, rWORD1, rWORD2
++	blr
++
++	.align	4
++	cfi_adjust_cfa_offset(64)
++L(dLcr7):
++	lwz	rWORD7, 44(r1)
++	lwz	rWORD8, 48(r1)
++L(dLcr7x):
+ 	li	rRTN, 1
+-        lwz     1,0(1)
+-	bgtlr	cr0
++	addi	1, 1, 64
++	cfi_adjust_cfa_offset(-64)
++	bgtlr	cr7
+ 	li	rRTN, -1
+ 	blr
+-	.align 4
++	.align	4
++	cfi_adjust_cfa_offset(64)
+ L(dLcr1):
+-        lwz     r30,44(1)
+-        lwz     r31,48(1)
++	lwz	rWORD7, 44(r1)
++	lwz	rWORD8, 48(r1)
++L(dLcr1x):
+ 	li	rRTN, 1
+-        lwz     1,0(1)
++	addi	1, 1, 64
++	cfi_adjust_cfa_offset(-64)
+ 	bgtlr	cr1
+ 	li	rRTN, -1
+ 	blr
+-	.align 4
++	.align	4
++	cfi_adjust_cfa_offset(64)
+ L(dLcr6):
+-        lwz     r30,44(1)
+-        lwz     r31,48(1)
++	lwz	rWORD7, 44(r1)
++	lwz	rWORD8, 48(r1)
++L(dLcr6x):
+ 	li	rRTN, 1
+-        lwz     1,0(1)
++	addi	1, 1, 64
++	cfi_adjust_cfa_offset(-64)
+ 	bgtlr	cr6
+ 	li	rRTN, -1
+ 	blr
+-	.align 4
++	.align	4
++	cfi_adjust_cfa_offset(64)
+ L(dLcr5):
+-        lwz     r30,44(1)
+-        lwz     r31,48(1)
++	lwz	rWORD7, 44(r1)
++	lwz	rWORD8, 48(r1)
+ L(dLcr5x):
+ 	li	rRTN, 1
+-        lwz     1,0(1)
++	addi	1, 1, 64
++	cfi_adjust_cfa_offset(-64)
+ 	bgtlr	cr5
+ 	li	rRTN, -1
+ 	blr
+ 	
+-	.align 4
++	.align	4
+ L(bytealigned):
+-	cfi_adjust_cfa_offset(-64)
+-	mtctr   rN	/* Power4 wants mtctr 1st in dispatch group */
++	mtctr	rN	/* Power4 wants mtctr 1st in dispatch group */
+ 
+ /* We need to prime this loop.  This loop is swing modulo scheduled
+    to avoid pipe delays.  The dependent instruction latencies (load to 
+@@ -413,7 +622,7 @@
+ 	lbz	rWORD1, 0(rSTR1)
+ 	lbz	rWORD2, 0(rSTR2)
+ 	bdz-	L(b11)
+-	cmplw	cr0, rWORD1, rWORD2
++	cmplw	cr7, rWORD1, rWORD2
+ 	lbz	rWORD3, 1(rSTR1)
+ 	lbz	rWORD4, 1(rSTR2)
+ 	bdz-	L(b12)
+@@ -421,11 +630,11 @@
+ 	lbzu	rWORD5, 2(rSTR1)
+ 	lbzu	rWORD6, 2(rSTR2)
+ 	bdz-	L(b13)
+-	.align 4
++	.align	4
+ L(bLoop):
+ 	lbzu	rWORD1, 1(rSTR1)
+ 	lbzu	rWORD2, 1(rSTR2)
+-	bne-	cr0, L(bLcr0)
++	bne-	cr7, L(bLcr7)
+ 
+ 	cmplw	cr6, rWORD5, rWORD6
+ 	bdz-	L(b3i)
+@@ -434,7 +643,7 @@
+ 	lbzu	rWORD4, 1(rSTR2)
+ 	bne-	cr1, L(bLcr1)
+ 
+-	cmplw	cr0, rWORD1, rWORD2
++	cmplw	cr7, rWORD1, rWORD2
+ 	bdz-	L(b2i)
+ 
+ 	lbzu	rWORD5, 1(rSTR1)
+@@ -451,23 +660,23 @@
+    tested.  In this case we must complete the pending operations
+    before returning.  */
+ L(b1i):
+-	bne-	cr0, L(bLcr0)
++	bne-	cr7, L(bLcr7)
+ 	bne-	cr1, L(bLcr1)
+ 	b	L(bx56)
+-	.align 4
++	.align	4
+ L(b2i):
+ 	bne-	cr6, L(bLcr6)
+-	bne-	cr0, L(bLcr0)
++	bne-	cr7, L(bLcr7)
+ 	b	L(bx34)
+-	.align 4
++	.align	4
+ L(b3i):
+ 	bne-	cr1, L(bLcr1)
+ 	bne-	cr6, L(bLcr6)
+ 	b	L(bx12)
+-	.align 4
+-L(bLcr0):
++	.align	4
++L(bLcr7):
+ 	li	rRTN, 1
+-	bgtlr	cr0
++	bgtlr	cr7
+ 	li	rRTN, -1
+ 	blr
+ L(bLcr1):
+@@ -482,36 +691,31 @@
+ 	blr
+ 
+ L(b13):
+-	bne-	cr0, L(bx12)
++	bne-	cr7, L(bx12)
+ 	bne-	cr1, L(bx34)
+ L(bx56):
+ 	sub	rRTN, rWORD5, rWORD6
+ 	blr
+ 	nop
+ L(b12):
+-	bne-	cr0, L(bx12)
++	bne-	cr7, L(bx12)
+ L(bx34):	
+ 	sub	rRTN, rWORD3, rWORD4
+ 	blr
+-
+ L(b11):
+ L(bx12):
+ 	sub	rRTN, rWORD1, rWORD2
+ 	blr
+-
+-	.align 4 
+-L(zeroLengthReturn):
+-
++	.align	4
+ L(zeroLength):
+ 	li	rRTN, 0
+ 	blr
+ 
+-	cfi_adjust_cfa_offset(64)
+-	.align 4
++	.align	4
+ /* At this point we know the strings have different alignment and the
+-   compare length is at least 8 bytes.  rBITDIF contains the low order
++   compare length is at least 8 bytes.  r12 contains the low order
+    2 bits of rSTR1 and cr5 contains the result of the logical compare
+-   of rBITDIF to 0.  If rBITDIF == 0 then rStr1 is word aligned and can 
++   of r12 to 0.  If r12 == 0 then rSTR1 is word aligned and can
+    perform the Wunaligned loop.
+   
+    Otherwise we know that rSTR1 is not aready word aligned yet.
+@@ -520,79 +724,88 @@
+    eliminate bits preceeding the first byte.  Since we want to join the
+    normal (Wualigned) compare loop, starting at the second word,
+    we need to adjust the length (rN) and special case the loop
+-   versioning for the first W. This insures that the loop count is
++   versioning for the first W. This ensures that the loop count is
+    correct and the first W (shifted) is in the expected resister pair.  */
+ #define rSHL		r29	/* Unaligned shift left count.  */
+ #define rSHR		r28	/* Unaligned shift right count.  */
+-#define rB		r27	/* Left rotation temp for rWORD2.  */
+-#define rD		r26	/* Left rotation temp for rWORD4.  */
+-#define rF		r25	/* Left rotation temp for rWORD6.  */
+-#define rH		r24	/* Left rotation temp for rWORD8.  */
+-#define rA		r0	/* Right rotation temp for rWORD2.  */
+-#define rC		r12	/* Right rotation temp for rWORD4.  */
+-#define rE		r0	/* Right rotation temp for rWORD6.  */
+-#define rG		r12	/* Right rotation temp for rWORD8.  */
++#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
++#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
++#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
++#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
++	cfi_adjust_cfa_offset(64)
+ L(unaligned):
+-	stw     r29,40(r1)	
+-	cfi_offset(r29,(40-64))	
++	stw	rSHL, 40(r1)
++	cfi_offset(rSHL, (40-64))
+ 	clrlwi	rSHL, rSTR2, 30
+-        stw     r28,36(r1)	
+-	cfi_offset(r28,(36-64))
++	stw	rSHR, 36(r1)
++	cfi_offset(rSHR, (36-64))
+ 	beq	cr5, L(Wunaligned)
+-        stw     r27,32(r1)	
+-	cfi_offset(r27,(32-64))
++	stw	rWORD8_SHIFT, 32(r1)
++	cfi_offset(rWORD8_SHIFT, (32-64))
+ /* Adjust the logical start of rSTR2 to compensate for the extra bits
+    in the 1st rSTR1 W.  */
+-	sub	r27, rSTR2, rBITDIF
++	sub	rWORD8_SHIFT, rSTR2, r12
+ /* But do not attempt to address the W before that W that contains
+    the actual start of rSTR2.  */
+ 	clrrwi	rSTR2, rSTR2, 2
+-        stw     r26,28(r1)	
+-	cfi_offset(r26,(28-64))
+-/* Compute the left/right shift counts for the unalign rSTR2,
++	stw	rWORD2_SHIFT, 28(r1)
++	cfi_offset(rWORD2_SHIFT, (28-64))
++/* Compute the left/right shift counts for the unaligned rSTR2,
+    compensating for the logical (W aligned) start of rSTR1.  */ 
+-	clrlwi	rSHL, r27, 30
++	clrlwi	rSHL, rWORD8_SHIFT, 30
+ 	clrrwi	rSTR1, rSTR1, 2	
+-        stw     r25,24(r1)	
+-	cfi_offset(r25,(24-64))
++	stw	rWORD4_SHIFT, 24(r1)
++	cfi_offset(rWORD4_SHIFT, (24-64))
+ 	slwi	rSHL, rSHL, 3
+-	cmplw	cr5, r27, rSTR2
+-	add	rN, rN, rBITDIF
+-	slwi	r11, rBITDIF, 3
+-        stw     r24,20(r1)	
+-	cfi_offset(r24,(20-64))
++	cmplw	cr5, rWORD8_SHIFT, rSTR2
++	add	rN, rN, r12
++	slwi	rWORD6, r12, 3
++	stw	rWORD6_SHIFT, 20(r1)
++	cfi_offset(rWORD6_SHIFT, (20-64))
+ 	subfic	rSHR, rSHL, 32
+-	srwi	rTMP, rN, 4      /* Divide by 16 */
+-	andi.	rBITDIF, rN, 12  /* Get the W remainder */
++	srwi	r0, rN, 4	/* Divide by 16 */
++	andi.	r12, rN, 12	/* Get the W remainder */
+ /* We normally need to load 2 Ws to start the unaligned rSTR2, but in
+    this special case those bits may be discarded anyway.  Also we
+    must avoid loading a W where none of the bits are part of rSTR2 as
+    this may cross a page boundary and cause a page fault.  */
+ 	li	rWORD8, 0
+ 	blt	cr5, L(dus0)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD8, 0(rSTR2)
+-	la	rSTR2, 4(rSTR2)
++	addi	rSTR2, rSTR2, 4
++#endif
+ 	slw	rWORD8, rWORD8, rSHL
+ 
+ L(dus0):
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD1, 0(rSTR1)
+ 	lwz	rWORD2, 0(rSTR2)
+-	cmplwi	cr1, rBITDIF, 8
++#endif
++	cmplwi	cr1, r12, 8
+ 	cmplwi	cr7, rN, 16
+-	srw	rG, rWORD2, rSHR
++	srw	r12, rWORD2, rSHR
+ 	clrlwi	rN, rN, 30
+ 	beq	L(duPs4)
+-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+-	or	rWORD8, rG, rWORD8
++	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
++	or	rWORD8, r12, rWORD8
+ 	bgt	cr1, L(duPs3)
+ 	beq	cr1, L(duPs2)
+ 
+ /* Remainder is 4 */
+-	.align 4
++	.align	4
+ L(dusP1):
+-	slw	rB, rWORD2, rSHL
+-	slw	rWORD7, rWORD1, r11
+-	slw	rWORD8, rWORD8, r11
++	slw	rWORD8_SHIFT, rWORD2, rSHL
++	slw	rWORD7, rWORD1, rWORD6
++	slw	rWORD8, rWORD8, rWORD6
+ 	bge	cr7, L(duP1e)
+ /* At this point we exit early with the first word compare
+    complete and remainder of 0 to 3 bytes.  See L(du14) for details on
+@@ -602,95 +815,133 @@
+ 	bne	cr5, L(duLcr5)
+ 	cmplw	cr7, rN, rSHR
+ 	beq	L(duZeroReturn)
+-	li	rA, 0
++	li	r0, 0
+ 	ble	cr7, L(dutrim)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD2, 4(rSTR2)
+-	srw	rA, rWORD2, rSHR
++#endif
++	srw	r0, rWORD2, rSHR
+ 	b	L(dutrim)
+ /* Remainder is 8 */
+-	.align 4
++	.align	4
+ L(duPs2):
+-	slw	rH, rWORD2, rSHL
+-	slw	rWORD5, rWORD1, r11
+-	slw	rWORD6, rWORD8, r11
++	slw	rWORD6_SHIFT, rWORD2, rSHL
++	slw	rWORD5, rWORD1, rWORD6
++	slw	rWORD6, rWORD8, rWORD6
+ 	b	L(duP2e)
+ /* Remainder is 12 */
+-	.align 4
++	.align	4
+ L(duPs3):
+-	slw	rF, rWORD2, rSHL
+-	slw	rWORD3, rWORD1, r11
+-	slw	rWORD4, rWORD8, r11
++	slw	rWORD4_SHIFT, rWORD2, rSHL
++	slw	rWORD3, rWORD1, rWORD6
++	slw	rWORD4, rWORD8, rWORD6
+ 	b	L(duP3e)
+ /* Count is a multiple of 16, remainder is 0 */
+-	.align 4
++	.align	4
+ L(duPs4):
+-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+-	or	rWORD8, rG, rWORD8
+-	slw	rD, rWORD2, rSHL
+-	slw	rWORD1, rWORD1, r11
+-	slw	rWORD2, rWORD8, r11
++	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
++	or	rWORD8, r12, rWORD8
++	slw	rWORD2_SHIFT, rWORD2, rSHL
++	slw	rWORD1, rWORD1, rWORD6
++	slw	rWORD2, rWORD8, rWORD6
+ 	b	L(duP4e)
+ 
+ /* At this point we know rSTR1 is word aligned and the
+    compare length is at least 8 bytes.  */
+-	.align 4
++	.align	4
+ L(Wunaligned):
+-        stw     r27,32(r1)	
+-	cfi_offset(r27,(32-64))
++	stw	rWORD8_SHIFT, 32(r1)
++	cfi_offset(rWORD8_SHIFT, (32-64))
+ 	clrrwi	rSTR2, rSTR2, 2
+-        stw     r26,28(r1)	
+-	cfi_offset(r26,(28-64))
+-	srwi	rTMP, rN, 4	 /* Divide by 16 */
+-        stw     r25,24(r1)	
+-	cfi_offset(r25,(24-64))
+-	andi.	rBITDIF, rN, 12  /* Get the W remainder */
+-        stw     r24,20(r1)	
+-	cfi_offset(r24,(20-64))
++	stw	rWORD2_SHIFT, 28(r1)
++	cfi_offset(rWORD2_SHIFT, (28-64))
++	srwi	r0, rN, 4	/* Divide by 16 */
++	stw	rWORD4_SHIFT, 24(r1)
++	cfi_offset(rWORD4_SHIFT, (24-64))
++	andi.	r12, rN, 12	/* Get the W remainder */
++	stw	rWORD6_SHIFT, 20(r1)
++	cfi_offset(rWORD6_SHIFT, (20-64))
+ 	slwi	rSHL, rSHL, 3
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD6, 0, rSTR2
++	addi	rSTR2, rSTR2, 4
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD6, 0(rSTR2)
+ 	lwzu	rWORD8, 4(rSTR2)
+-	cmplwi	cr1, rBITDIF, 8
++#endif
++	cmplwi	cr1, r12, 8
+ 	cmplwi	cr7, rN, 16
+ 	clrlwi	rN, rN, 30
+ 	subfic	rSHR, rSHL, 32
+-	slw	rH, rWORD6, rSHL
++	slw	rWORD6_SHIFT, rWORD6, rSHL
+ 	beq	L(duP4)
+-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
++	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+ 	bgt	cr1, L(duP3)
+ 	beq	cr1, L(duP2)
+ 		
+ /* Remainder is 4 */
+-	.align 4
++	.align	4
+ L(duP1):
+-	srw	rG, rWORD8, rSHR
++	srw	r12, rWORD8, rSHR
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD7, 0, rSTR1
++	addi	rSTR1, rSTR1, 4
++#else
+ 	lwz	rWORD7, 0(rSTR1)
+-	slw	rB, rWORD8, rSHL
+-	or	rWORD8, rG, rH
++#endif
++	slw	rWORD8_SHIFT, rWORD8, rSHL
++	or	rWORD8, r12, rWORD6_SHIFT
+ 	blt	cr7, L(duP1x)
+ L(duP1e):
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD1, 4(rSTR1)
+ 	lwz	rWORD2, 4(rSTR2)
++#endif
+ 	cmplw	cr5, rWORD7, rWORD8
+-	srw	rA, rWORD2, rSHR
+-	slw	rD, rWORD2, rSHL
+-	or	rWORD2, rA, rB
++	srw	r0, rWORD2, rSHR
++	slw	rWORD2_SHIFT, rWORD2, rSHL
++	or	rWORD2, r0, rWORD8_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD3, 0, rSTR1
++	lwbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD3, 8(rSTR1)
+ 	lwz	rWORD4, 8(rSTR2)
+-	cmplw	cr0, rWORD1, rWORD2
+-	srw	rC, rWORD4, rSHR
+-	slw	rF, rWORD4, rSHL
++#endif
++	cmplw	cr7, rWORD1, rWORD2
++	srw	r12, rWORD4, rSHR
++	slw	rWORD4_SHIFT, rWORD4, rSHL
+ 	bne	cr5, L(duLcr5)
+-	or	rWORD4, rC, rD
++	or	rWORD4, r12, rWORD2_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD5, 0, rSTR1
++	lwbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD5, 12(rSTR1)
+ 	lwz	rWORD6, 12(rSTR2)
++#endif
+ 	cmplw	cr1, rWORD3, rWORD4
+-	srw	rE, rWORD6, rSHR
+-	slw	rH, rWORD6, rSHL
+-	bne	cr0, L(duLcr0)
+-	or	rWORD6, rE, rF
++	srw	r0, rWORD6, rSHR
++	slw	rWORD6_SHIFT, rWORD6, rSHL
++	bne	cr7, L(duLcr7)
++	or	rWORD6, r0, rWORD4_SHIFT
+ 	cmplw	cr6, rWORD5, rWORD6
+ 	b	L(duLoop3)	
+-	.align 4
++	.align	4
+ /* At this point we exit early with the first word compare
+    complete and remainder of 0 to 3 bytes.  See L(du14) for details on
+    how we handle the remaining bytes.  */
+@@ -700,186 +951,321 @@
+ 	bne	cr5, L(duLcr5)
+ 	cmplw	cr7, rN, rSHR
+ 	beq	L(duZeroReturn)
+-	li	rA, 0
++	li	r0, 0
+ 	ble	cr7, L(dutrim)
+-	ld	rWORD2, 8(rSTR2)
+-	srw	rA, rWORD2, rSHR
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD2, 8(rSTR2)
++#endif
++	srw	r0, rWORD2, rSHR
+ 	b	L(dutrim)
+ /* Remainder is 8 */
+-	.align 4
++	.align	4
+ L(duP2):
+-	srw	rE, rWORD8, rSHR
++	srw	r0, rWORD8, rSHR
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD5, 0, rSTR1
++	addi	rSTR1, rSTR1, 4
++#else
+ 	lwz	rWORD5, 0(rSTR1)
+-	or	rWORD6, rE, rH
+-	slw	rH, rWORD8, rSHL
++#endif
++	or	rWORD6, r0, rWORD6_SHIFT
++	slw	rWORD6_SHIFT, rWORD8, rSHL
+ L(duP2e):
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD7, 0, rSTR1
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD7, 4(rSTR1)
+ 	lwz	rWORD8, 4(rSTR2)
++#endif
+ 	cmplw	cr6, rWORD5, rWORD6
+-	srw	rG, rWORD8, rSHR
+-	slw	rB, rWORD8, rSHL
+-	or	rWORD8, rG, rH
++	srw	r12, rWORD8, rSHR
++	slw	rWORD8_SHIFT, rWORD8, rSHL
++	or	rWORD8, r12, rWORD6_SHIFT
+ 	blt	cr7, L(duP2x)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD1, 8(rSTR1)
+ 	lwz	rWORD2, 8(rSTR2)
++#endif
+ 	cmplw	cr5, rWORD7, rWORD8
+ 	bne	cr6, L(duLcr6)
+-	srw	rA, rWORD2, rSHR
+-	slw	rD, rWORD2, rSHL
+-	or	rWORD2, rA, rB
++	srw	r0, rWORD2, rSHR
++	slw	rWORD2_SHIFT, rWORD2, rSHL
++	or	rWORD2, r0, rWORD8_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD3, 0, rSTR1
++	lwbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD3, 12(rSTR1)
+ 	lwz	rWORD4, 12(rSTR2)
+-	cmplw	cr0, rWORD1, rWORD2
++#endif
++	cmplw	cr7, rWORD1, rWORD2
+ 	bne	cr5, L(duLcr5)
+-	srw	rC, rWORD4, rSHR
+-	slw	rF, rWORD4, rSHL
+-	or	rWORD4, rC, rD
++	srw	r12, rWORD4, rSHR
++	slw	rWORD4_SHIFT, rWORD4, rSHL
++	or	rWORD4, r12, rWORD2_SHIFT
++#ifndef __LITTLE_ENDIAN__
+ 	addi	rSTR1, rSTR1, 4
+ 	addi	rSTR2, rSTR2, 4
++#endif
+ 	cmplw	cr1, rWORD3, rWORD4
+ 	b	L(duLoop2)
+-	.align 4
++	.align	4
+ L(duP2x):
+ 	cmplw	cr5, rWORD7, rWORD8
++#ifndef __LITTLE_ENDIAN__
+ 	addi	rSTR1, rSTR1, 4
+ 	addi	rSTR2, rSTR2, 4
++#endif
+ 	bne	cr6, L(duLcr6)
+ 	slwi.	rN, rN, 3
+ 	bne	cr5, L(duLcr5)
+ 	cmplw	cr7, rN, rSHR
+ 	beq	L(duZeroReturn)
+-	li	rA, 0
++	li	r0, 0
+ 	ble	cr7, L(dutrim)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD2, 4(rSTR2)
+-	srw	rA, rWORD2, rSHR
++#endif
++	srw	r0, rWORD2, rSHR
+ 	b	L(dutrim)
+ 		
+ /* Remainder is 12 */
+-	.align 4
++	.align	4
+ L(duP3):
+-	srw	rC, rWORD8, rSHR
++	srw	r12, rWORD8, rSHR
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD3, 0, rSTR1
++	addi	rSTR1, rSTR1, 4
++#else
+ 	lwz	rWORD3, 0(rSTR1)
+-	slw	rF, rWORD8, rSHL
+-	or	rWORD4, rC, rH
++#endif
++	slw	rWORD4_SHIFT, rWORD8, rSHL
++	or	rWORD4, r12, rWORD6_SHIFT
+ L(duP3e):
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD5, 0, rSTR1
++	lwbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD5, 4(rSTR1)
+ 	lwz	rWORD6, 4(rSTR2)
++#endif
+ 	cmplw	cr1, rWORD3, rWORD4
+-	srw	rE, rWORD6, rSHR
+-	slw	rH, rWORD6, rSHL
+-	or	rWORD6, rE, rF
++	srw	r0, rWORD6, rSHR
++	slw	rWORD6_SHIFT, rWORD6, rSHL
++	or	rWORD6, r0, rWORD4_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD7, 0, rSTR1
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD7, 8(rSTR1)
+ 	lwz	rWORD8, 8(rSTR2)
++#endif
+ 	cmplw	cr6, rWORD5, rWORD6
+ 	bne	cr1, L(duLcr1)
+-	srw	rG, rWORD8, rSHR
+-	slw	rB, rWORD8, rSHL
+-	or	rWORD8, rG, rH
++	srw	r12, rWORD8, rSHR
++	slw	rWORD8_SHIFT, rWORD8, rSHL
++	or	rWORD8, r12, rWORD6_SHIFT
+ 	blt	cr7, L(duP3x)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD1, 12(rSTR1)
+ 	lwz	rWORD2, 12(rSTR2)
++#endif
+ 	cmplw	cr5, rWORD7, rWORD8
+ 	bne	cr6, L(duLcr6)
+-	srw	rA, rWORD2, rSHR
+-	slw	rD, rWORD2, rSHL
+-	or	rWORD2, rA, rB
++	srw	r0, rWORD2, rSHR
++	slw	rWORD2_SHIFT, rWORD2, rSHL
++	or	rWORD2, r0, rWORD8_SHIFT
++#ifndef __LITTLE_ENDIAN__
+ 	addi	rSTR1, rSTR1, 8
+ 	addi	rSTR2, rSTR2, 8
+-	cmplw	cr0, rWORD1, rWORD2
++#endif
++	cmplw	cr7, rWORD1, rWORD2
+ 	b	L(duLoop1)
+-	.align 4
++	.align	4
+ L(duP3x):
++#ifndef __LITTLE_ENDIAN__
+ 	addi	rSTR1, rSTR1, 8
+ 	addi	rSTR2, rSTR2, 8
++#endif
++#if 0
++/* Redundant: cr1 was already tested before branching to L(duP3x).  */
+ 	bne	cr1, L(duLcr1)
++#endif
+ 	cmplw	cr5, rWORD7, rWORD8
+ 	bne	cr6, L(duLcr6)
+ 	slwi.	rN, rN, 3
+ 	bne	cr5, L(duLcr5)
+ 	cmplw	cr7, rN, rSHR
+ 	beq	L(duZeroReturn)
+-	li	rA, 0
++	li	r0, 0
+ 	ble	cr7, L(dutrim)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD2, 4(rSTR2)
+-	srw	rA, rWORD2, rSHR
++#endif
++	srw	r0, rWORD2, rSHR
+ 	b	L(dutrim)
+ 	
+ /* Count is a multiple of 16, remainder is 0 */
+-	.align 4
++	.align	4
+ L(duP4):
+-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+-	srw	rA, rWORD8, rSHR
++	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
++	srw	r0, rWORD8, rSHR
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	addi	rSTR1, rSTR1, 4
++#else
+ 	lwz	rWORD1, 0(rSTR1)
+-	slw	rD, rWORD8, rSHL
+-	or	rWORD2, rA, rH
++#endif
++	slw	rWORD2_SHIFT, rWORD8, rSHL
++	or	rWORD2, r0, rWORD6_SHIFT
+ L(duP4e):
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD3, 0, rSTR1
++	lwbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD3, 4(rSTR1)
+ 	lwz	rWORD4, 4(rSTR2)
+-	cmplw	cr0, rWORD1, rWORD2
+-	srw	rC, rWORD4, rSHR
+-	slw	rF, rWORD4, rSHL
+-	or	rWORD4, rC, rD
++#endif
++	cmplw	cr7, rWORD1, rWORD2
++	srw	r12, rWORD4, rSHR
++	slw	rWORD4_SHIFT, rWORD4, rSHL
++	or	rWORD4, r12, rWORD2_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD5, 0, rSTR1
++	lwbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD5, 8(rSTR1)
+ 	lwz	rWORD6, 8(rSTR2)
++#endif
+ 	cmplw	cr1, rWORD3, rWORD4
+-	bne	cr0, L(duLcr0)
+-	srw	rE, rWORD6, rSHR
+-	slw	rH, rWORD6, rSHL
+-	or	rWORD6, rE, rF
++	bne	cr7, L(duLcr7)
++	srw	r0, rWORD6, rSHR
++	slw	rWORD6_SHIFT, rWORD6, rSHL
++	or	rWORD6, r0, rWORD4_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD7, 0, rSTR1
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwzu	rWORD7, 12(rSTR1)
+ 	lwzu	rWORD8, 12(rSTR2)
++#endif
+ 	cmplw	cr6, rWORD5, rWORD6
+ 	bne	cr1, L(duLcr1)
+-	srw	rG, rWORD8, rSHR
+-	slw	rB, rWORD8, rSHL
+-	or	rWORD8, rG, rH
++	srw	r12, rWORD8, rSHR
++	slw	rWORD8_SHIFT, rWORD8, rSHL
++	or	rWORD8, r12, rWORD6_SHIFT
+ 	cmplw	cr5, rWORD7, rWORD8
+ 	bdz-	L(du24)		/* Adjust CTR as we start with +4 */
+ /* This is the primary loop */
+-	.align 4
++	.align	4
+ L(duLoop):
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD1, 4(rSTR1)
+ 	lwz	rWORD2, 4(rSTR2)
++#endif
+ 	cmplw	cr1, rWORD3, rWORD4
+ 	bne	cr6, L(duLcr6)
+-	srw	rA, rWORD2, rSHR
+-	slw	rD, rWORD2, rSHL
+-	or	rWORD2, rA, rB
++	srw	r0, rWORD2, rSHR
++	slw	rWORD2_SHIFT, rWORD2, rSHL
++	or	rWORD2, r0, rWORD8_SHIFT
+ L(duLoop1):
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD3, 0, rSTR1
++	lwbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD3, 8(rSTR1)
+ 	lwz	rWORD4, 8(rSTR2)
++#endif
+ 	cmplw	cr6, rWORD5, rWORD6
+ 	bne	cr5, L(duLcr5)
+-	srw	rC, rWORD4, rSHR
+-	slw	rF, rWORD4, rSHL
+-	or	rWORD4, rC, rD
++	srw	r12, rWORD4, rSHR
++	slw	rWORD4_SHIFT, rWORD4, rSHL
++	or	rWORD4, r12, rWORD2_SHIFT
+ L(duLoop2):
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD5, 0, rSTR1
++	lwbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD5, 12(rSTR1)
+ 	lwz	rWORD6, 12(rSTR2)
++#endif
+ 	cmplw	cr5, rWORD7, rWORD8
+-	bne	cr0, L(duLcr0)
+-	srw	rE, rWORD6, rSHR
+-	slw	rH, rWORD6, rSHL
+-	or	rWORD6, rE, rF
++	bne	cr7, L(duLcr7)
++	srw	r0, rWORD6, rSHR
++	slw	rWORD6_SHIFT, rWORD6, rSHL
++	or	rWORD6, r0, rWORD4_SHIFT
+ L(duLoop3):
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD7, 0, rSTR1
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwzu	rWORD7, 16(rSTR1)
+ 	lwzu	rWORD8, 16(rSTR2)
+-	cmplw	cr0, rWORD1, rWORD2
++#endif
++	cmplw	cr7, rWORD1, rWORD2
+ 	bne-	cr1, L(duLcr1)
+-	srw	rG, rWORD8, rSHR
+-	slw	rB, rWORD8, rSHL
+-	or	rWORD8, rG, rH
++	srw	r12, rWORD8, rSHR
++	slw	rWORD8_SHIFT, rWORD8, rSHL
++	or	rWORD8, r12, rWORD6_SHIFT
+ 	bdnz+	L(duLoop)	
+ 	
+ L(duL4):
++#if 0
++/* Redundant: cr1 was already tested by the bne- at the end of L(duLoop).  */
+ 	bne	cr1, L(duLcr1)
++#endif
+ 	cmplw	cr1, rWORD3, rWORD4
+ 	bne	cr6, L(duLcr6)
+ 	cmplw	cr6, rWORD5, rWORD6
+ 	bne	cr5, L(duLcr5)
+ 	cmplw	cr5, rWORD7, rWORD8
+ L(du44):
+-	bne	cr0, L(duLcr0)
++	bne	cr7, L(duLcr7)
+ L(du34):
+ 	bne	cr1, L(duLcr1)
+ L(du24):
+@@ -889,95 +1275,101 @@
+ 	bne	cr5, L(duLcr5)
+ /* At this point we have a remainder of 1 to 3 bytes to compare.  We use
+    shift right to eliminate bits beyond the compare length. 
++   This allows the use of word subtract to compute the final result.
+ 
+    However it may not be safe to load rWORD2 which may be beyond the 
+    string length. So we compare the bit length of the remainder to
+    the right shift count (rSHR). If the bit count is less than or equal
+    we do not need to load rWORD2 (all significant bits are already in
+-   rB).  */
++   rWORD8_SHIFT).  */
+ 	cmplw	cr7, rN, rSHR
+ 	beq	L(duZeroReturn)
+-	li	rA, 0
++	li	r0, 0
+ 	ble	cr7, L(dutrim)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR2, rSTR2, 4
++#else
+ 	lwz	rWORD2, 4(rSTR2)
+-	srw	rA, rWORD2, rSHR
+-	.align 4
++#endif
++	srw	r0, rWORD2, rSHR
++	.align	4
+ L(dutrim):
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++#else
+ 	lwz	rWORD1, 4(rSTR1)
+-        lwz     r31,48(1)
++#endif
++	lwz	rWORD8, 48(r1)
+ 	subfic	rN, rN, 32	/* Shift count is 32 - (rN * 8).  */ 
+-	or	rWORD2, rA, rB
+-        lwz     r30,44(1)
+-        lwz     r29,40(r1)
++	or	rWORD2, r0, rWORD8_SHIFT
++	lwz	rWORD7, 44(r1)
++	lwz	rSHL, 40(r1)
+ 	srw	rWORD1, rWORD1, rN
+ 	srw	rWORD2, rWORD2, rN
+-        lwz     r28,36(r1)	
+-        lwz     r27,32(r1)
+-        cmplw   rWORD1,rWORD2
+-        li      rRTN,0
+-        beq     L(dureturn26)
+-        li      rRTN,1
+-        bgt     L(dureturn26)
+-        li      rRTN,-1
+-	b    L(dureturn26)
+-	.align 4
+-L(duLcr0):
+-        lwz     r31,48(1)
+-        lwz     r30,44(1)
+-	li	rRTN, 1
+-	bgt	cr0, L(dureturn29)	
+-	lwz     r29,40(r1)
+-        lwz     r28,36(r1)	
++	lwz	rSHR, 36(r1)
++	lwz	rWORD8_SHIFT, 32(r1)
++	sub	rRTN, rWORD1, rWORD2
++	b	L(dureturn26)
++	.align	4
++L(duLcr7):
++	lwz	rWORD8, 48(r1)
++	lwz	rWORD7, 44(r1)
++	li	rRTN, 1
++	bgt	cr7, L(dureturn29)
++	lwz	rSHL, 40(r1)
++	lwz	rSHR, 36(r1)
+ 	li	rRTN, -1
+ 	b	L(dureturn27)
+-	.align 4
++	.align	4
+ L(duLcr1):
+-        lwz     r31,48(1)
+-        lwz     r30,44(1)
++	lwz	rWORD8, 48(r1)
++	lwz	rWORD7, 44(r1)
+ 	li	rRTN, 1
+ 	bgt	cr1, L(dureturn29)	
+-        lwz     r29,40(r1)
+-        lwz     r28,36(r1)	
++	lwz	rSHL, 40(r1)
++	lwz	rSHR, 36(r1)
+ 	li	rRTN, -1
+ 	b	L(dureturn27)
+-	.align 4
++	.align	4
+ L(duLcr6):
+-        lwz     r31,48(1)
+-        lwz     r30,44(1)
++	lwz	rWORD8, 48(r1)
++	lwz	rWORD7, 44(r1)
+ 	li	rRTN, 1
+ 	bgt	cr6, L(dureturn29)	
+-        lwz     r29,40(r1)
+-        lwz     r28,36(r1)	
++	lwz	rSHL, 40(r1)
++	lwz	rSHR, 36(r1)
+ 	li	rRTN, -1
+ 	b	L(dureturn27)
+-	.align 4
++	.align	4
+ L(duLcr5):
+-        lwz     r31,48(1)
+-        lwz     r30,44(1)
++	lwz	rWORD8, 48(r1)
++	lwz	rWORD7, 44(r1)
+ 	li	rRTN, 1
+ 	bgt	cr5, L(dureturn29)	
+-        lwz     r29,40(r1)
+-        lwz     r28,36(r1)	
++	lwz	rSHL, 40(r1)
++	lwz	rSHR, 36(r1)
+ 	li	rRTN, -1
+ 	b	L(dureturn27)
+ 	.align	3
+ L(duZeroReturn):
+-	li	rRTN,0
++	li	rRTN, 0
+ 	.align	4
+ L(dureturn):
+-        lwz     r31,48(1)
+-        lwz     r30,44(1)
++	lwz	rWORD8, 48(r1)
++	lwz	rWORD7, 44(r1)
+ L(dureturn29):	
+-        lwz     r29,40(r1)
+-        lwz     r28,36(r1)	
++	lwz	rSHL, 40(r1)
++	lwz	rSHR, 36(r1)
+ L(dureturn27):	
+-        lwz     r27,32(r1)
++	lwz	rWORD8_SHIFT, 32(r1)
+ L(dureturn26):	
+-        lwz     r26,28(r1)
++	lwz	rWORD2_SHIFT, 28(r1)
+ L(dureturn25):	
+-        lwz     r25,24(r1)
+-        lwz     r24,20(r1)
+-        lwz     1,0(1)
++	lwz	rWORD4_SHIFT, 24(r1)
++	lwz	rWORD6_SHIFT, 20(r1)
++	addi	1, 1, 64
++	cfi_adjust_cfa_offset(-64)
+ 	blr
+ END (BP_SYM (memcmp))
+ 
+diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcmp.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcmp.S
+--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcmp.S	2014-05-28 19:22:37.000000000 -0500
++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/memcmp.S	2014-05-28 21:44:57.000000000 -0500
+@@ -25,10 +25,9 @@
+ 		    size_t size [r5])  */
+ 
+ 	.machine power7
+-EALIGN (BP_SYM(memcmp),4,0)
++EALIGN (BP_SYM(memcmp), 4, 0)
+ 	CALL_MCOUNT
+ 
+-#define rTMP	r0
+ #define rRTN	r3
+ #define rSTR1	r3	/* first string arg */
+ #define rSTR2	r4	/* second string arg */
+@@ -39,35 +38,32 @@
+ #define rWORD4	r9	/* next word in s2 */
+ #define rWORD5	r10	/* next word in s1 */
+ #define rWORD6	r11	/* next word in s2 */
+-#define rBITDIF	r12	/* bits that differ in s1 & s2 words */
+ #define rWORD7	r30	/* next word in s1 */
+ #define rWORD8	r31	/* next word in s2 */
+ 
+-	xor	rTMP,rSTR2,rSTR1
+-	cmplwi	cr6,rN,0
+-	cmplwi	cr1,rN,12
+-	clrlwi.	rTMP,rTMP,30
+-	clrlwi	rBITDIF,rSTR1,30
+-	cmplwi	cr5,rBITDIF,0
+-	beq-	cr6,L(zeroLength)
+-	dcbt	0,rSTR1
+-	dcbt	0,rSTR2
+-
+-	/* If less than 8 bytes or not aligned, use the unaligned
+-	   byte loop.  */
+-
+-	blt	cr1,L(bytealigned)
+-	stwu	1,-64(1)
++	xor	r0, rSTR2, rSTR1
++	cmplwi	cr6, rN, 0
++	cmplwi	cr1, rN, 12
++	clrlwi.	r0, r0, 30
++	clrlwi	r12, rSTR1, 30
++	cmplwi	cr5, r12, 0
++	beq-	cr6, L(zeroLength)
++	dcbt	0, rSTR1
++	dcbt	0, rSTR2
++/* If the length is less than 12 bytes, use the byte-at-a-time loop
++   regardless of alignment.  */
++	blt	cr1, L(bytealigned)
++	stwu	1, -64(r1)
+ 	cfi_adjust_cfa_offset(64)
+-	stw	r31,48(1)
+-	cfi_offset(31,(48-64))
+-	stw	r30,44(1)
+-	cfi_offset(30,(44-64))
++	stw	rWORD8, 48(r1)
++	cfi_offset(rWORD8, (48-64))
++	stw	rWORD7, 44(r1)
++	cfi_offset(rWORD7, (44-64))
+ 	bne	L(unaligned)
+ /* At this point we know both strings have the same alignment and the
+-   compare length is at least 8 bytes.  rBITDIF contains the low order
++   compare length is at least 8 bytes.  r12 contains the low order
+    2 bits of rSTR1 and cr5 contains the result of the logical compare
+-   of rBITDIF to 0.  If rBITDIF == 0 then we are already word
++   of r12 to 0.  If r12 == 0 then we are already word
+    aligned and can perform the word aligned loop.
+ 
+    Otherwise we know the two strings have the same alignment (but not
+@@ -76,332 +72,541 @@
+    eliminate bits preceeding the first byte.  Since we want to join the
+    normal (word aligned) compare loop, starting at the second word,
+    we need to adjust the length (rN) and special case the loop
+-   versioning for the first word. This insures that the loop count is
++   versioning for the first word. This ensures that the loop count is
+    correct and the first word (shifted) is in the expected register pair. */
+ 	.align	4
+ L(samealignment):
+-	clrrwi	rSTR1,rSTR1,2
+-	clrrwi	rSTR2,rSTR2,2
+-	beq	cr5,L(Waligned)
+-	add	rN,rN,rBITDIF
+-	slwi	r11,rBITDIF,3
+-	srwi	rTMP,rN,4	/* Divide by 16 */
+-	andi.	rBITDIF,rN,12	/* Get the word remainder */
+-	lwz	rWORD1,0(rSTR1)
+-	lwz	rWORD2,0(rSTR2)
+-	cmplwi	cr1,rBITDIF,8
+-	cmplwi	cr7,rN,16
+-	clrlwi	rN,rN,30
++	clrrwi	rSTR1, rSTR1, 2
++	clrrwi	rSTR2, rSTR2, 2
++	beq	cr5, L(Waligned)
++	add	rN, rN, r12
++	slwi	rWORD6, r12, 3
++	srwi	r0, rN, 4	/* Divide by 16 */
++	andi.	r12, rN, 12	/* Get the word remainder */
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD1, 0(rSTR1)
++	lwz	rWORD2, 0(rSTR2)
++#endif
++	cmplwi	cr1, r12, 8
++	cmplwi	cr7, rN, 16
++	clrlwi	rN, rN, 30
+ 	beq	L(dPs4)
+-	mtctr	rTMP
+-	bgt	cr1,L(dPs3)
+-	beq	cr1,L(dPs2)
++	mtctr	r0
++	bgt	cr1, L(dPs3)
++	beq	cr1, L(dPs2)
+ 
+ /* Remainder is 4 */
+ 	.align	3
+ L(dsP1):
+-	slw	rWORD5,rWORD1,r11
+-	slw	rWORD6,rWORD2,r11
+-	cmplw	cr5,rWORD5,rWORD6
+-	blt	cr7,L(dP1x)
++	slw	rWORD5, rWORD1, rWORD6
++	slw	rWORD6, rWORD2, rWORD6
++	cmplw	cr5, rWORD5, rWORD6
++	blt	cr7, L(dP1x)
+ /* Do something useful in this cycle since we have to branch anyway.  */
+-	lwz	rWORD1,4(rSTR1)
+-	lwz	rWORD2,4(rSTR2)
+-	cmplw	cr0,rWORD1,rWORD2
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD1, 4(rSTR1)
++	lwz	rWORD2, 4(rSTR2)
++#endif
++	cmplw	cr7, rWORD1, rWORD2
+ 	b	L(dP1e)
+ /* Remainder is 8 */
+ 	.align	4
+ L(dPs2):
+-	slw	rWORD5,rWORD1,r11
+-	slw	rWORD6,rWORD2,r11
+-	cmplw	cr6,rWORD5,rWORD6
+-	blt	cr7,L(dP2x)
++	slw	rWORD5, rWORD1, rWORD6
++	slw	rWORD6, rWORD2, rWORD6
++	cmplw	cr6, rWORD5, rWORD6
++	blt	cr7, L(dP2x)
+ /* Do something useful in this cycle since we have to branch anyway.  */
+-	lwz	rWORD7,4(rSTR1)
+-	lwz	rWORD8,4(rSTR2)
+-	cmplw	cr5,rWORD7,rWORD8
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD7, 0, rSTR1
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD7, 4(rSTR1)
++	lwz	rWORD8, 4(rSTR2)
++#endif
++	cmplw	cr5, rWORD7, rWORD8
+ 	b	L(dP2e)
+ /* Remainder is 12 */
+ 	.align	4
+ L(dPs3):
+-	slw	rWORD3,rWORD1,r11
+-	slw	rWORD4,rWORD2,r11
+-	cmplw	cr1,rWORD3,rWORD4
++	slw	rWORD3, rWORD1, rWORD6
++	slw	rWORD4, rWORD2, rWORD6
++	cmplw	cr1, rWORD3, rWORD4
+ 	b	L(dP3e)
+ /* Count is a multiple of 16, remainder is 0 */
+ 	.align	4
+ L(dPs4):
+-	mtctr	rTMP
+-	slw	rWORD1,rWORD1,r11
+-	slw	rWORD2,rWORD2,r11
+-	cmplw	cr0,rWORD1,rWORD2
++	mtctr	r0
++	slw	rWORD1, rWORD1, rWORD6
++	slw	rWORD2, rWORD2, rWORD6
++	cmplw	cr7, rWORD1, rWORD2
+ 	b	L(dP4e)
+ 
+ /* At this point we know both strings are word aligned and the
+    compare length is at least 8 bytes.  */
+ 	.align	4
+ L(Waligned):
+-	andi.	rBITDIF,rN,12	/* Get the word remainder */
+-	srwi	rTMP,rN,4	/* Divide by 16 */
+-	cmplwi	cr1,rBITDIF,8
+-	cmplwi	cr7,rN,16
+-	clrlwi	rN,rN,30
++	andi.	r12, rN, 12	/* Get the word remainder */
++	srwi	r0, rN, 4	/* Divide by 16 */
++	cmplwi	cr1, r12, 8
++	cmplwi	cr7, rN, 16
++	clrlwi	rN, rN, 30
+ 	beq	L(dP4)
+-	bgt	cr1,L(dP3)
+-	beq	cr1,L(dP2)
++	bgt	cr1, L(dP3)
++	beq	cr1, L(dP2)
+ 
+ /* Remainder is 4 */
+ 	.align	4
+ L(dP1):
+-	mtctr	rTMP
++	mtctr	r0
+ /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
+    (8-15 byte compare), we want to use only volatile registers.  This
+    means we can avoid restoring non-volatile registers since we did not
+    change any on the early exit path.  The key here is the non-early
+    exit path only cares about the condition code (cr5), not about which
+    register pair was used.  */
+-	lwz	rWORD5,0(rSTR1)
+-	lwz	rWORD6,0(rSTR2)
+-	cmplw	cr5,rWORD5,rWORD6
+-	blt	cr7,L(dP1x)
+-	lwz	rWORD1,4(rSTR1)
+-	lwz	rWORD2,4(rSTR2)
+-	cmplw	cr0,rWORD1,rWORD2
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD5, 0, rSTR1
++	lwbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD5, 0(rSTR1)
++	lwz	rWORD6, 0(rSTR2)
++#endif
++	cmplw	cr5, rWORD5, rWORD6
++	blt	cr7, L(dP1x)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD1, 4(rSTR1)
++	lwz	rWORD2, 4(rSTR2)
++#endif
++	cmplw	cr7, rWORD1, rWORD2
+ L(dP1e):
+-	lwz	rWORD3,8(rSTR1)
+-	lwz	rWORD4,8(rSTR2)
+-	cmplw	cr1,rWORD3,rWORD4
+-	lwz	rWORD5,12(rSTR1)
+-	lwz	rWORD6,12(rSTR2)
+-	cmplw	cr6,rWORD5,rWORD6
+-	bne	cr5,L(dLcr5)
+-	bne	cr0,L(dLcr0)
+-
+-	lwzu	rWORD7,16(rSTR1)
+-	lwzu	rWORD8,16(rSTR2)
+-	bne	cr1,L(dLcr1)
+-	cmplw	cr5,rWORD7,rWORD8
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD3, 0, rSTR1
++	lwbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD3, 8(rSTR1)
++	lwz	rWORD4, 8(rSTR2)
++#endif
++	cmplw	cr1, rWORD3, rWORD4
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD5, 0, rSTR1
++	lwbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD5, 12(rSTR1)
++	lwz	rWORD6, 12(rSTR2)
++#endif
++	cmplw	cr6, rWORD5, rWORD6
++	bne	cr5, L(dLcr5x)
++	bne	cr7, L(dLcr7x)
++
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD7, 0, rSTR1
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwzu	rWORD7, 16(rSTR1)
++	lwzu	rWORD8, 16(rSTR2)
++#endif
++	bne	cr1, L(dLcr1)
++	cmplw	cr5, rWORD7, rWORD8
+ 	bdnz	L(dLoop)
+-	bne	cr6,L(dLcr6)
+-	lwz	r30,44(1)
+-	lwz	r31,48(1)
++	bne	cr6, L(dLcr6)
++	lwz	rWORD7, 44(r1)
++	lwz	rWORD8, 48(r1)
+ 	.align	3
+ L(dP1x):
+-	slwi.	r12,rN,3
+-	bne	cr5,L(dLcr5)
+-	subfic	rN,r12,32	/* Shift count is 32 - (rN * 8).  */
+-	lwz	1,0(1)
++	slwi.	r12, rN, 3
++	bne	cr5, L(dLcr5x)
++	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
++	addi	r1, r1, 64
++	cfi_adjust_cfa_offset(-64)
+ 	bne	L(d00)
+-	li	rRTN,0
++	li	rRTN, 0
+ 	blr
+ 
+ /* Remainder is 8 */
+ 	.align	4
++	cfi_adjust_cfa_offset(64)
+ L(dP2):
+-	mtctr	rTMP
+-	lwz	rWORD5,0(rSTR1)
+-	lwz	rWORD6,0(rSTR2)
+-	cmplw	cr6,rWORD5,rWORD6
+-	blt	cr7,L(dP2x)
+-	lwz	rWORD7,4(rSTR1)
+-	lwz	rWORD8,4(rSTR2)
+-	cmplw	cr5,rWORD7,rWORD8
++	mtctr	r0
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD5, 0, rSTR1
++	lwbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD5, 0(rSTR1)
++	lwz	rWORD6, 0(rSTR2)
++#endif
++	cmplw	cr6, rWORD5, rWORD6
++	blt	cr7, L(dP2x)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD7, 0, rSTR1
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD7, 4(rSTR1)
++	lwz	rWORD8, 4(rSTR2)
++#endif
++	cmplw	cr5, rWORD7, rWORD8
+ L(dP2e):
+-	lwz	rWORD1,8(rSTR1)
+-	lwz	rWORD2,8(rSTR2)
+-	cmplw	cr0,rWORD1,rWORD2
+-	lwz	rWORD3,12(rSTR1)
+-	lwz	rWORD4,12(rSTR2)
+-	cmplw	cr1,rWORD3,rWORD4
+-	addi	rSTR1,rSTR1,4
+-	addi	rSTR2,rSTR2,4
+-	bne	cr6,L(dLcr6)
+-	bne	cr5,L(dLcr5)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD1, 8(rSTR1)
++	lwz	rWORD2, 8(rSTR2)
++#endif
++	cmplw	cr7, rWORD1, rWORD2
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD3, 0, rSTR1
++	lwbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD3, 12(rSTR1)
++	lwz	rWORD4, 12(rSTR2)
++#endif
++	cmplw	cr1, rWORD3, rWORD4
++#ifndef __LITTLE_ENDIAN__
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#endif
++	bne	cr6, L(dLcr6)
++	bne	cr5, L(dLcr5)
+ 	b	L(dLoop2)
+ /* Again we are on a early exit path (16-23 byte compare), we want to
+    only use volatile registers and avoid restoring non-volatile
+    registers.  */
+ 	.align	4
+ L(dP2x):
+-	lwz	rWORD3,4(rSTR1)
+-	lwz	rWORD4,4(rSTR2)
+-	cmplw	cr5,rWORD3,rWORD4
+-	slwi.	r12,rN,3
+-	bne	cr6,L(dLcr6)
+-	addi	rSTR1,rSTR1,4
+-	addi	rSTR2,rSTR2,4
+-	bne	cr5,L(dLcr5)
+-	subfic	rN,r12,32	/* Shift count is 32 - (rN * 8).  */
+-	lwz	1,0(1)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD3, 0, rSTR1
++	lwbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD3, 4(rSTR1)
++	lwz	rWORD4, 4(rSTR2)
++#endif
++	cmplw	cr1, rWORD3, rWORD4
++	slwi.	r12, rN, 3
++	bne	cr6, L(dLcr6x)
++#ifndef __LITTLE_ENDIAN__
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#endif
++	bne	cr1, L(dLcr1x)
++	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
++	addi	r1, r1, 64
++	cfi_adjust_cfa_offset(-64)
+ 	bne	L(d00)
+-	li	rRTN,0
++	li	rRTN, 0
+ 	blr
+ 
+ /* Remainder is 12 */
+ 	.align	4
++	cfi_adjust_cfa_offset(64)
+ L(dP3):
+-	mtctr	rTMP
+-	lwz	rWORD3,0(rSTR1)
+-	lwz	rWORD4,0(rSTR2)
+-	cmplw	cr1,rWORD3,rWORD4
++	mtctr	r0
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD3, 0, rSTR1
++	lwbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD3, 0(rSTR1)
++	lwz	rWORD4, 0(rSTR2)
++#endif
++	cmplw	cr1, rWORD3, rWORD4
+ L(dP3e):
+-	lwz	rWORD5,4(rSTR1)
+-	lwz	rWORD6,4(rSTR2)
+-	cmplw	cr6,rWORD5,rWORD6
+-	blt	cr7,L(dP3x)
+-	lwz	rWORD7,8(rSTR1)
+-	lwz	rWORD8,8(rSTR2)
+-	cmplw	cr5,rWORD7,rWORD8
+-	lwz	rWORD1,12(rSTR1)
+-	lwz	rWORD2,12(rSTR2)
+-	cmplw	cr0,rWORD1,rWORD2
+-	addi	rSTR1,rSTR1,8
+-	addi	rSTR2,rSTR2,8
+-	bne	cr1,L(dLcr1)
+-	bne	cr6,L(dLcr6)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD5, 0, rSTR1
++	lwbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD5, 4(rSTR1)
++	lwz	rWORD6, 4(rSTR2)
++#endif
++	cmplw	cr6, rWORD5, rWORD6
++	blt	cr7, L(dP3x)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD7, 0, rSTR1
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD7, 8(rSTR1)
++	lwz	rWORD8, 8(rSTR2)
++#endif
++	cmplw	cr5, rWORD7, rWORD8
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD1, 12(rSTR1)
++	lwz	rWORD2, 12(rSTR2)
++#endif
++	cmplw	cr7, rWORD1, rWORD2
++#ifndef __LITTLE_ENDIAN__
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#endif
++	bne	cr1, L(dLcr1)
++	bne	cr6, L(dLcr6)
+ 	b	L(dLoop1)
+ /* Again we are on a early exit path (24-31 byte compare), we want to
+    only use volatile registers and avoid restoring non-volatile
+    registers.  */
+ 	.align	4
+ L(dP3x):
+-	lwz	rWORD1,8(rSTR1)
+-	lwz	rWORD2,8(rSTR2)
+-	cmplw	cr5,rWORD1,rWORD2
+-	slwi.	r12,rN,3
+-	bne	cr1,L(dLcr1)
+-	addi	rSTR1,rSTR1,8
+-	addi	rSTR2,rSTR2,8
+-	bne	cr6,L(dLcr6)
+-	subfic	rN,r12,32	/* Shift count is 32 - (rN * 8).  */
+-	bne	cr5,L(dLcr5)
+-	lwz	1,0(1)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD1, 8(rSTR1)
++	lwz	rWORD2, 8(rSTR2)
++#endif
++	cmplw	cr7, rWORD1, rWORD2
++	slwi.	r12, rN, 3
++	bne	cr1, L(dLcr1x)
++#ifndef __LITTLE_ENDIAN__
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#endif
++	bne	cr6, L(dLcr6x)
++	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
++	bne	cr7, L(dLcr7x)
++	addi	r1, r1, 64
++	cfi_adjust_cfa_offset(-64)
+ 	bne	L(d00)
+-	li	rRTN,0
++	li	rRTN, 0
+ 	blr
+ 
+ /* Count is a multiple of 16, remainder is 0 */
+ 	.align	4
++	cfi_adjust_cfa_offset(64)
+ L(dP4):
+-	mtctr	rTMP
+-	lwz	rWORD1,0(rSTR1)
+-	lwz	rWORD2,0(rSTR2)
+-	cmplw	cr0,rWORD1,rWORD2
++	mtctr	r0
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD1, 0(rSTR1)
++	lwz	rWORD2, 0(rSTR2)
++#endif
++	cmplw	cr7, rWORD1, rWORD2
+ L(dP4e):
+-	lwz	rWORD3,4(rSTR1)
+-	lwz	rWORD4,4(rSTR2)
+-	cmplw	cr1,rWORD3,rWORD4
+-	lwz	rWORD5,8(rSTR1)
+-	lwz	rWORD6,8(rSTR2)
+-	cmplw	cr6,rWORD5,rWORD6
+-	lwzu	rWORD7,12(rSTR1)
+-	lwzu	rWORD8,12(rSTR2)
+-	cmplw	cr5,rWORD7,rWORD8
+-	bne	cr0,L(dLcr0)
+-	bne	cr1,L(dLcr1)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD3, 0, rSTR1
++	lwbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD3, 4(rSTR1)
++	lwz	rWORD4, 4(rSTR2)
++#endif
++	cmplw	cr1, rWORD3, rWORD4
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD5, 0, rSTR1
++	lwbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD5, 8(rSTR1)
++	lwz	rWORD6, 8(rSTR2)
++#endif
++	cmplw	cr6, rWORD5, rWORD6
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD7, 0, rSTR1
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwzu	rWORD7, 12(rSTR1)
++	lwzu	rWORD8, 12(rSTR2)
++#endif
++	cmplw	cr5, rWORD7, rWORD8
++	bne	cr7, L(dLcr7)
++	bne	cr1, L(dLcr1)
+ 	bdz-	L(d24)		/* Adjust CTR as we start with +4 */
+ /* This is the primary loop */
+ 	.align	4
+ L(dLoop):
+-	lwz	rWORD1,4(rSTR1)
+-	lwz	rWORD2,4(rSTR2)
+-	cmplw	cr1,rWORD3,rWORD4
+-	bne	cr6,L(dLcr6)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD1, 4(rSTR1)
++	lwz	rWORD2, 4(rSTR2)
++#endif
++	cmplw	cr1, rWORD3, rWORD4
++	bne	cr6, L(dLcr6)
+ L(dLoop1):
+-	lwz	rWORD3,8(rSTR1)
+-	lwz	rWORD4,8(rSTR2)
+-	cmplw	cr6,rWORD5,rWORD6
+-	bne	cr5,L(dLcr5)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD3, 0, rSTR1
++	lwbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD3, 8(rSTR1)
++	lwz	rWORD4, 8(rSTR2)
++#endif
++	cmplw	cr6, rWORD5, rWORD6
++	bne	cr5, L(dLcr5)
+ L(dLoop2):
+-	lwz	rWORD5,12(rSTR1)
+-	lwz	rWORD6,12(rSTR2)
+-	cmplw	cr5,rWORD7,rWORD8
+-	bne	cr0,L(dLcr0)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD5, 0, rSTR1
++	lwbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD5, 12(rSTR1)
++	lwz	rWORD6, 12(rSTR2)
++#endif
++	cmplw	cr5, rWORD7, rWORD8
++	bne	cr7, L(dLcr7)
+ L(dLoop3):
+-	lwzu	rWORD7,16(rSTR1)
+-	lwzu	rWORD8,16(rSTR2)
+-	bne	cr1,L(dLcr1)
+-	cmplw	cr0,rWORD1,rWORD2
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD7, 0, rSTR1
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwzu	rWORD7, 16(rSTR1)
++	lwzu	rWORD8, 16(rSTR2)
++#endif
++	bne	cr1, L(dLcr1)
++	cmplw	cr7, rWORD1, rWORD2
+ 	bdnz	L(dLoop)
+ 
+ L(dL4):
+-	cmplw	cr1,rWORD3,rWORD4
+-	bne	cr6,L(dLcr6)
+-	cmplw	cr6,rWORD5,rWORD6
+-	bne	cr5,L(dLcr5)
+-	cmplw	cr5,rWORD7,rWORD8
++	cmplw	cr1, rWORD3, rWORD4
++	bne	cr6, L(dLcr6)
++	cmplw	cr6, rWORD5, rWORD6
++	bne	cr5, L(dLcr5)
++	cmplw	cr5, rWORD7, rWORD8
+ L(d44):
+-	bne	cr0,L(dLcr0)
++	bne	cr7, L(dLcr7)
+ L(d34):
+-	bne	cr1,L(dLcr1)
++	bne	cr1, L(dLcr1)
+ L(d24):
+-	bne	cr6,L(dLcr6)
++	bne	cr6, L(dLcr6)
+ L(d14):
+-	slwi.	r12,rN,3
+-	bne	cr5,L(dLcr5)
++	slwi.	r12, rN, 3
++	bne	cr5, L(dLcr5)
+ L(d04):
+-	lwz	r30,44(1)
+-	lwz	r31,48(1)
+-	lwz	1,0(1)
+-	subfic	rN,r12,32	/* Shift count is 32 - (rN * 8).  */
++	lwz	rWORD7, 44(r1)
++	lwz	rWORD8, 48(r1)
++	addi	r1, r1, 64
++	cfi_adjust_cfa_offset(-64)
++	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
+ 	beq	L(zeroLength)
+ /* At this point we have a remainder of 1 to 3 bytes to compare.  Since
+    we are aligned it is safe to load the whole word, and use
+-   shift right to eliminate bits beyond the compare length. */
++   shift right to eliminate bits beyond the compare length.  */
+ L(d00):
+-	lwz	rWORD1,4(rSTR1)
+-	lwz	rWORD2,4(rSTR2)
+-	srw	rWORD1,rWORD1,rN
+-	srw	rWORD2,rWORD2,rN
+-	cmplw	rWORD1,rWORD2
+-	li	rRTN,0
+-	beqlr
+-	li	rRTN,1
+-	bgtlr
+-	li	rRTN,-1
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD1, 4(rSTR1)
++	lwz	rWORD2, 4(rSTR2)
++#endif
++	srw	rWORD1, rWORD1, rN
++	srw	rWORD2, rWORD2, rN
++	sub	rRTN, rWORD1, rWORD2
+ 	blr
+ 
+ 	.align	4
+-L(dLcr0):
+-	lwz	r30,44(1)
+-	lwz	r31,48(1)
+-	li	rRTN,1
+-	lwz	1,0(1)
+-	bgtlr	cr0
+-	li	rRTN,-1
++	cfi_adjust_cfa_offset(64)
++L(dLcr7):
++	lwz	rWORD7, 44(r1)
++	lwz	rWORD8, 48(r1)
++L(dLcr7x):
++	li	rRTN, 1
++	addi	r1, r1, 64
++	cfi_adjust_cfa_offset(-64)
++	bgtlr	cr7
++	li	rRTN, -1
+ 	blr
+ 	.align	4
++	cfi_adjust_cfa_offset(64)
+ L(dLcr1):
+-	lwz	r30,44(1)
+-	lwz	r31,48(1)
+-	li	rRTN,1
+-	lwz	1,0(1)
++	lwz	rWORD7, 44(r1)
++	lwz	rWORD8, 48(r1)
++L(dLcr1x):
++	li	rRTN, 1
++	addi	r1, r1, 64
++	cfi_adjust_cfa_offset(-64)
+ 	bgtlr	cr1
+-	li	rRTN,-1
++	li	rRTN, -1
+ 	blr
+ 	.align	4
++	cfi_adjust_cfa_offset(64)
+ L(dLcr6):
+-	lwz	r30,44(1)
+-	lwz	r31,48(1)
+-	li	rRTN,1
+-	lwz	1,0(1)
++	lwz	rWORD7, 44(r1)
++	lwz	rWORD8, 48(r1)
++L(dLcr6x):
++	li	rRTN, 1
++	addi	r1, r1, 64
++	cfi_adjust_cfa_offset(-64)
+ 	bgtlr	cr6
+-	li	rRTN,-1
++	li	rRTN, -1
+ 	blr
+ 	.align	4
++	cfi_adjust_cfa_offset(64)
+ L(dLcr5):
+-	lwz	r30,44(1)
+-	lwz	r31,48(1)
++	lwz	rWORD7, 44(r1)
++	lwz	rWORD8, 48(r1)
+ L(dLcr5x):
+-	li	rRTN,1
+-	lwz	1,0(1)
++	li	rRTN, 1
++	addi	r1, r1, 64
++	cfi_adjust_cfa_offset(-64)
+ 	bgtlr	cr5
+-	li	rRTN,-1
++	li	rRTN, -1
+ 	blr
+ 
+ 	.align	4
+ L(bytealigned):
+-	cfi_adjust_cfa_offset(-64)
+ 	mtctr	rN
+ 
+ /* We need to prime this loop.  This loop is swing modulo scheduled
+@@ -413,38 +618,39 @@
+ 
+    So we must precondition some registers and condition codes so that
+    we don't exit the loop early on the first iteration.  */
+-	lbz	rWORD1,0(rSTR1)
+-	lbz	rWORD2,0(rSTR2)
++
++	lbz	rWORD1, 0(rSTR1)
++	lbz	rWORD2, 0(rSTR2)
+ 	bdz	L(b11)
+-	cmplw	cr0,rWORD1,rWORD2
+-	lbz	rWORD3,1(rSTR1)
+-	lbz	rWORD4,1(rSTR2)
++	cmplw	cr7, rWORD1, rWORD2
++	lbz	rWORD3, 1(rSTR1)
++	lbz	rWORD4, 1(rSTR2)
+ 	bdz	L(b12)
+-	cmplw	cr1,rWORD3,rWORD4
+-	lbzu	rWORD5,2(rSTR1)
+-	lbzu	rWORD6,2(rSTR2)
++	cmplw	cr1, rWORD3, rWORD4
++	lbzu	rWORD5, 2(rSTR1)
++	lbzu	rWORD6, 2(rSTR2)
+ 	bdz	L(b13)
+ 	.align	4
+ L(bLoop):
+-	lbzu	rWORD1,1(rSTR1)
+-	lbzu	rWORD2,1(rSTR2)
+-	bne	cr0,L(bLcr0)
++	lbzu	rWORD1, 1(rSTR1)
++	lbzu	rWORD2, 1(rSTR2)
++	bne	cr7, L(bLcr7)
+ 
+-	cmplw	cr6,rWORD5,rWORD6
++	cmplw	cr6, rWORD5, rWORD6
+ 	bdz	L(b3i)
+ 
+-	lbzu	rWORD3,1(rSTR1)
+-	lbzu	rWORD4,1(rSTR2)
+-	bne	cr1,L(bLcr1)
++	lbzu	rWORD3, 1(rSTR1)
++	lbzu	rWORD4, 1(rSTR2)
++	bne	cr1, L(bLcr1)
+ 
+-	cmplw	cr0,rWORD1,rWORD2
++	cmplw	cr7, rWORD1, rWORD2
+ 	bdz	L(b2i)
+ 
+-	lbzu	rWORD5,1(rSTR1)
+-	lbzu	rWORD6,1(rSTR2)
+-	bne	cr6,L(bLcr6)
++	lbzu	rWORD5, 1(rSTR1)
++	lbzu	rWORD6, 1(rSTR2)
++	bne	cr6, L(bLcr6)
+ 
+-	cmplw	cr1,rWORD3,rWORD4
++	cmplw	cr1, rWORD3, rWORD4
+ 	bdnz	L(bLoop)
+ 
+ /* We speculatively loading bytes before we have tested the previous
+@@ -454,67 +660,62 @@
+    tested.  In this case we must complete the pending operations
+    before returning.  */
+ L(b1i):
+-	bne	cr0,L(bLcr0)
+-	bne	cr1,L(bLcr1)
++	bne	cr7, L(bLcr7)
++	bne	cr1, L(bLcr1)
+ 	b	L(bx56)
+ 	.align	4
+ L(b2i):
+-	bne	cr6,L(bLcr6)
+-	bne	cr0,L(bLcr0)
++	bne	cr6, L(bLcr6)
++	bne	cr7, L(bLcr7)
+ 	b	L(bx34)
+ 	.align	4
+ L(b3i):
+-	bne	cr1,L(bLcr1)
+-	bne	cr6,L(bLcr6)
++	bne	cr1, L(bLcr1)
++	bne	cr6, L(bLcr6)
+ 	b	L(bx12)
+ 	.align	4
+-L(bLcr0):
+-	li	rRTN,1
+-	bgtlr	cr0
+-	li	rRTN,-1
++L(bLcr7):
++	li	rRTN, 1
++	bgtlr	cr7
++	li	rRTN, -1
+ 	blr
+ L(bLcr1):
+-	li	rRTN,1
++	li	rRTN, 1
+ 	bgtlr	cr1
+-	li	rRTN,-1
++	li	rRTN, -1
+ 	blr
+ L(bLcr6):
+-	li	rRTN,1
++	li	rRTN, 1
+ 	bgtlr	cr6
+-	li	rRTN,-1
++	li	rRTN, -1
+ 	blr
+ 
+ L(b13):
+-	bne	cr0,L(bx12)
+-	bne	cr1,L(bx34)
++	bne	cr7, L(bx12)
++	bne	cr1, L(bx34)
+ L(bx56):
+-	sub	rRTN,rWORD5,rWORD6
++	sub	rRTN, rWORD5, rWORD6
+ 	blr
+ 	nop
+ L(b12):
+-	bne	cr0,L(bx12)
++	bne	cr7, L(bx12)
+ L(bx34):
+-	sub	rRTN,rWORD3,rWORD4
++	sub	rRTN, rWORD3, rWORD4
+ 	blr
+-
+ L(b11):
+ L(bx12):
+-	sub	rRTN,rWORD1,rWORD2
++	sub	rRTN, rWORD1, rWORD2
+ 	blr
+-
+ 	.align	4
+-L(zeroLengthReturn):
+-
+ L(zeroLength):
+-	li	rRTN,0
++	li	rRTN, 0
+ 	blr
+ 
+-	cfi_adjust_cfa_offset(64)
+ 	.align	4
+ /* At this point we know the strings have different alignment and the
+-   compare length is at least 8 bytes.  rBITDIF contains the low order
++   compare length is at least 8 bytes.  r12 contains the low order
+    2 bits of rSTR1 and cr5 contains the result of the logical compare
+-   of rBITDIF to 0.  If rBITDIF == 0 then rStr1 is word aligned and can
++   of r12 to 0.  If r12 == 0 then rSTR1 is word aligned and can
+    perform the Wunaligned loop.
+ 
+    Otherwise we know that rSTR1 is not aready word aligned yet.
+@@ -523,465 +724,654 @@
+    eliminate bits preceeding the first byte.  Since we want to join the
+    normal (Wualigned) compare loop, starting at the second word,
+    we need to adjust the length (rN) and special case the loop
+-   versioning for the first W. This insures that the loop count is
++   versioning for the first W. This ensures that the loop count is
+    correct and the first W (shifted) is in the expected resister pair.  */
+ #define rSHL		r29	/* Unaligned shift left count.  */
+ #define rSHR		r28	/* Unaligned shift right count.  */
+-#define rB		r27	/* Left rotation temp for rWORD2.  */
+-#define rD		r26	/* Left rotation temp for rWORD4.  */
+-#define rF		r25	/* Left rotation temp for rWORD6.  */
+-#define rH		r24	/* Left rotation temp for rWORD8.  */
+-#define rA		r0	/* Right rotation temp for rWORD2.  */
+-#define rC		r12	/* Right rotation temp for rWORD4.  */
+-#define rE		r0	/* Right rotation temp for rWORD6.  */
+-#define rG		r12	/* Right rotation temp for rWORD8.  */
++#define rWORD8_SHIFT	r27	/* rWORD8 << rSHL, ORed into rWORD2 (main loop).  */
++#define rWORD2_SHIFT	r26	/* rWORD2 << rSHL, ORed into rWORD4 (main loop).  */
++#define rWORD4_SHIFT	r25	/* rWORD4 << rSHL, ORed into rWORD6 (main loop).  */
++#define rWORD6_SHIFT	r24	/* rWORD6 << rSHL, ORed into rWORD8 (main loop).  */
++	cfi_adjust_cfa_offset(64)
+ L(unaligned):
+-	stw	r29,40(r1)
+-	cfi_offset(r29,(40-64))
+-	clrlwi	rSHL,rSTR2,30
+-	stw	r28,36(r1)
+-	cfi_offset(r28,(36-64))
+-	beq	cr5,L(Wunaligned)
+-	stw	r27,32(r1)
+-	cfi_offset(r27,(32-64))
++	stw	rSHL, 40(r1)
++	cfi_offset(rSHL, (40-64))
++	clrlwi	rSHL, rSTR2, 30
++	stw	rSHR, 36(r1)
++	cfi_offset(rSHR, (36-64))
++	beq	cr5, L(Wunaligned)
++	stw	rWORD8_SHIFT, 32(r1)
++	cfi_offset(rWORD8_SHIFT, (32-64))
+ /* Adjust the logical start of rSTR2 to compensate for the extra bits
+    in the 1st rSTR1 W.  */
+-	sub	r27,rSTR2,rBITDIF
++	sub	rWORD8_SHIFT, rSTR2, r12
+ /* But do not attempt to address the W before that W that contains
+    the actual start of rSTR2.  */
+-	clrrwi	rSTR2,rSTR2,2
+-	stw	r26,28(r1)
+-	cfi_offset(r26,(28-64))
+-/* Compute the left/right shift counts for the unalign rSTR2,
++	clrrwi	rSTR2, rSTR2, 2
++	stw	rWORD2_SHIFT, 28(r1)
++	cfi_offset(rWORD2_SHIFT, (28-64))
++/* Compute the left/right shift counts for the unaligned rSTR2,
+    compensating for the logical (W aligned) start of rSTR1.  */
+-	clrlwi	rSHL,r27,30
+-	clrrwi	rSTR1,rSTR1,2
+-	stw	r25,24(r1)
+-	cfi_offset(r25,(24-64))
+-	slwi	rSHL,rSHL,3
+-	cmplw	cr5,r27,rSTR2
+-	add	rN,rN,rBITDIF
+-	slwi	r11,rBITDIF,3
+-	stw	r24,20(r1)
+-	cfi_offset(r24,(20-64))
+-	subfic	rSHR,rSHL,32
+-	srwi	rTMP,rN,4	/* Divide by 16 */
+-	andi.	rBITDIF,rN,12	/* Get the W remainder */
++	clrlwi	rSHL, rWORD8_SHIFT, 30
++	clrrwi	rSTR1, rSTR1, 2
++	stw	rWORD4_SHIFT, 24(r1)
++	cfi_offset(rWORD4_SHIFT, (24-64))
++	slwi	rSHL, rSHL, 3
++	cmplw	cr5, rWORD8_SHIFT, rSTR2
++	add	rN, rN, r12
++	slwi	rWORD6, r12, 3
++	stw	rWORD6_SHIFT, 20(r1)
++	cfi_offset(rWORD6_SHIFT, (20-64))
++	subfic	rSHR, rSHL, 32
++	srwi	r0, rN, 4	/* Divide by 16 */
++	andi.	r12, rN, 12	/* Get the W remainder */
+ /* We normally need to load 2 Ws to start the unaligned rSTR2, but in
+    this special case those bits may be discarded anyway.  Also we
+    must avoid loading a W where none of the bits are part of rSTR2 as
+    this may cross a page boundary and cause a page fault.  */
+-	li	rWORD8,0
+-	blt	cr5,L(dus0)
+-	lwz	rWORD8,0(rSTR2)
+-	la	rSTR2,4(rSTR2)
+-	slw	rWORD8,rWORD8,rSHL
++	li	rWORD8, 0
++	blt	cr5, L(dus0)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD8, 0(rSTR2)
++	addi	rSTR2, rSTR2, 4
++#endif
++	slw	rWORD8, rWORD8, rSHL
+ 
+ L(dus0):
+-	lwz	rWORD1,0(rSTR1)
+-	lwz	rWORD2,0(rSTR2)
+-	cmplwi	cr1,rBITDIF,8
+-	cmplwi	cr7,rN,16
+-	srw	rG,rWORD2,rSHR
+-	clrlwi	rN,rN,30
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD1, 0(rSTR1)
++	lwz	rWORD2, 0(rSTR2)
++#endif
++	cmplwi	cr1, r12, 8
++	cmplwi	cr7, rN, 16
++	srw	r12, rWORD2, rSHR
++	clrlwi	rN, rN, 30
+ 	beq	L(duPs4)
+-	mtctr	rTMP
+-	or	rWORD8,rG,rWORD8
+-	bgt	cr1,L(duPs3)
+-	beq	cr1,L(duPs2)
++	mtctr	r0
++	or	rWORD8, r12, rWORD8
++	bgt	cr1, L(duPs3)
++	beq	cr1, L(duPs2)
+ 
+ /* Remainder is 4 */
+ 	.align	4
+ L(dusP1):
+-	slw	rB,rWORD2,rSHL
+-	slw	rWORD7,rWORD1,r11
+-	slw	rWORD8,rWORD8,r11
+-	bge	cr7,L(duP1e)
++	slw	rWORD8_SHIFT, rWORD2, rSHL
++	slw	rWORD7, rWORD1, rWORD6
++	slw	rWORD8, rWORD8, rWORD6
++	bge	cr7, L(duP1e)
+ /* At this point we exit early with the first word compare
+    complete and remainder of 0 to 3 bytes.  See L(du14) for details on
+    how we handle the remaining bytes.  */
+-	cmplw	cr5,rWORD7,rWORD8
+-	slwi.	rN,rN,3
+-	bne	cr5,L(duLcr5)
+-	cmplw	cr7,rN,rSHR
++	cmplw	cr5, rWORD7, rWORD8
++	slwi.	rN, rN, 3
++	bne	cr5, L(duLcr5)
++	cmplw	cr7, rN, rSHR
+ 	beq	L(duZeroReturn)
+-	li	rA,0
+-	ble	cr7,L(dutrim)
+-	lwz	rWORD2,4(rSTR2)
+-	srw	rA,rWORD2,rSHR
++	li	r0, 0
++	ble	cr7, L(dutrim)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD2, 4(rSTR2)
++#endif
++	srw	r0, rWORD2, rSHR
+ 	b	L(dutrim)
+ /* Remainder is 8 */
+ 	.align	4
+ L(duPs2):
+-	slw	rH,rWORD2,rSHL
+-	slw	rWORD5,rWORD1,r11
+-	slw	rWORD6,rWORD8,r11
++	slw	rWORD6_SHIFT, rWORD2, rSHL
++	slw	rWORD5, rWORD1, rWORD6
++	slw	rWORD6, rWORD8, rWORD6
+ 	b	L(duP2e)
+ /* Remainder is 12 */
+ 	.align	4
+ L(duPs3):
+-	slw	rF,rWORD2,rSHL
+-	slw	rWORD3,rWORD1,r11
+-	slw	rWORD4,rWORD8,r11
++	slw	rWORD4_SHIFT, rWORD2, rSHL
++	slw	rWORD3, rWORD1, rWORD6
++	slw	rWORD4, rWORD8, rWORD6
+ 	b	L(duP3e)
+ /* Count is a multiple of 16, remainder is 0 */
+ 	.align	4
+ L(duPs4):
+-	mtctr	rTMP
+-	or	rWORD8,rG,rWORD8
+-	slw	rD,rWORD2,rSHL
+-	slw	rWORD1,rWORD1,r11
+-	slw	rWORD2,rWORD8,r11
++	mtctr	r0
++	or	rWORD8, r12, rWORD8
++	slw	rWORD2_SHIFT, rWORD2, rSHL
++	slw	rWORD1, rWORD1, rWORD6
++	slw	rWORD2, rWORD8, rWORD6
+ 	b	L(duP4e)
+ 
+ /* At this point we know rSTR1 is word aligned and the
+    compare length is at least 8 bytes.  */
+ 	.align	4
+ L(Wunaligned):
+-	stw	r27,32(r1)
+-	cfi_offset(r27,(32-64))
+-	clrrwi	rSTR2,rSTR2,2
+-	stw	r26,28(r1)
+-	cfi_offset(r26,(28-64))
+-	srwi	rTMP,rN,4	/* Divide by 16 */
+-	stw	r25,24(r1)
+-	cfi_offset(r25,(24-64))
+-	andi.	rBITDIF,rN,12	/* Get the W remainder */
+-	stw	r24,20(r1)
+-	cfi_offset(r24,(24-64))
+-	slwi	rSHL,rSHL,3
+-	lwz	rWORD6,0(rSTR2)
+-	lwzu	rWORD8,4(rSTR2)
+-	cmplwi	cr1,rBITDIF,8
+-	cmplwi	cr7,rN,16
+-	clrlwi	rN,rN,30
+-	subfic	rSHR,rSHL,32
+-	slw	rH,rWORD6,rSHL
++	stw	rWORD8_SHIFT, 32(r1)
++	cfi_offset(rWORD8_SHIFT, (32-64))
++	clrrwi	rSTR2, rSTR2, 2
++	stw	rWORD2_SHIFT, 28(r1)
++	cfi_offset(rWORD2_SHIFT, (28-64))
++	srwi	r0, rN, 4	/* Divide by 16 */
++	stw	rWORD4_SHIFT, 24(r1)
++	cfi_offset(rWORD4_SHIFT, (24-64))
++	andi.	r12, rN, 12	/* Get the W remainder */
++	stw	rWORD6_SHIFT, 20(r1)
++	cfi_offset(rWORD6_SHIFT, (20-64))
++	slwi	rSHL, rSHL, 3
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD6, 0, rSTR2
++	addi	rSTR2, rSTR2, 4
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD6, 0(rSTR2)
++	lwzu	rWORD8, 4(rSTR2)
++#endif
++	cmplwi	cr1, r12, 8
++	cmplwi	cr7, rN, 16
++	clrlwi	rN, rN, 30
++	subfic	rSHR, rSHL, 32
++	slw	rWORD6_SHIFT, rWORD6, rSHL
+ 	beq	L(duP4)
+-	mtctr	rTMP
+-	bgt	cr1,L(duP3)
+-	beq	cr1,L(duP2)
++	mtctr	r0
++	bgt	cr1, L(duP3)
++	beq	cr1, L(duP2)
+ 
+ /* Remainder is 4 */
+ 	.align	4
+ L(duP1):
+-	srw	rG,rWORD8,rSHR
+-	lwz	rWORD7,0(rSTR1)
+-	slw	rB,rWORD8,rSHL
+-	or	rWORD8,rG,rH
+-	blt	cr7,L(duP1x)
++	srw	r12, rWORD8, rSHR
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD7, 0, rSTR1
++	addi	rSTR1, rSTR1, 4
++#else
++	lwz	rWORD7, 0(rSTR1)
++#endif
++	slw	rWORD8_SHIFT, rWORD8, rSHL
++	or	rWORD8, r12, rWORD6_SHIFT
++	blt	cr7, L(duP1x)
+ L(duP1e):
+-	lwz	rWORD1,4(rSTR1)
+-	lwz	rWORD2,4(rSTR2)
+-	cmplw	cr5,rWORD7,rWORD8
+-	srw	rA,rWORD2,rSHR
+-	slw	rD,rWORD2,rSHL
+-	or	rWORD2,rA,rB
+-	lwz	rWORD3,8(rSTR1)
+-	lwz	rWORD4,8(rSTR2)
+-	cmplw	cr0,rWORD1,rWORD2
+-	srw	rC,rWORD4,rSHR
+-	slw	rF,rWORD4,rSHL
+-	bne	cr5,L(duLcr5)
+-	or	rWORD4,rC,rD
+-	lwz	rWORD5,12(rSTR1)
+-	lwz	rWORD6,12(rSTR2)
+-	cmplw	cr1,rWORD3,rWORD4
+-	srw	rE,rWORD6,rSHR
+-	slw	rH,rWORD6,rSHL
+-	bne	cr0,L(duLcr0)
+-	or	rWORD6,rE,rF
+-	cmplw	cr6,rWORD5,rWORD6
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD1, 4(rSTR1)
++	lwz	rWORD2, 4(rSTR2)
++#endif
++	cmplw	cr5, rWORD7, rWORD8
++	srw	r0, rWORD2, rSHR
++	slw	rWORD2_SHIFT, rWORD2, rSHL
++	or	rWORD2, r0, rWORD8_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD3, 0, rSTR1
++	lwbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD3, 8(rSTR1)
++	lwz	rWORD4, 8(rSTR2)
++#endif
++	cmplw	cr7, rWORD1, rWORD2
++	srw	r12, rWORD4, rSHR
++	slw	rWORD4_SHIFT, rWORD4, rSHL
++	bne	cr5, L(duLcr5)
++	or	rWORD4, r12, rWORD2_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD5, 0, rSTR1
++	lwbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD5, 12(rSTR1)
++	lwz	rWORD6, 12(rSTR2)
++#endif
++	cmplw	cr1, rWORD3, rWORD4
++	srw	r0, rWORD6, rSHR
++	slw	rWORD6_SHIFT, rWORD6, rSHL
++	bne	cr7, L(duLcr7)
++	or	rWORD6, r0, rWORD4_SHIFT
++	cmplw	cr6, rWORD5, rWORD6
+ 	b	L(duLoop3)
+ 	.align	4
+ /* At this point we exit early with the first word compare
+    complete and remainder of 0 to 3 bytes.  See L(du14) for details on
+    how we handle the remaining bytes.  */
+ L(duP1x):
+-	cmplw	cr5,rWORD7,rWORD8
+-	slwi.	rN,rN,3
+-	bne	cr5,L(duLcr5)
+-	cmplw	cr7,rN,rSHR
++	cmplw	cr5, rWORD7, rWORD8
++	slwi.	rN, rN, 3	/* Remainder bytes to bits; sets cr0.  */
++	bne	cr5, L(duLcr5)
++	cmplw	cr7, rN, rSHR
+ 	beq	L(duZeroReturn)
+-	li	rA,0
+-	ble	cr7,L(dutrim)
+-	ld	rWORD2,8(rSTR2)
+-	srw	rA,rWORD2,rSHR
++	li	r0, 0		/* r0 = bits from the next rSTR2 W (none yet).  */
++	ble	cr7, L(dutrim)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD2, 4(rSTR2)	/* Next W: rSTR2 still addresses the W loaded into rWORD8.  */
++#endif
++	srw	r0, rWORD2, rSHR
+ 	b	L(dutrim)
+ /* Remainder is 8 */
+ 	.align	4
+ L(duP2):
+-	srw	rE,rWORD8,rSHR
+-	lwz	rWORD5,0(rSTR1)
+-	or	rWORD6,rE,rH
+-	slw	rH,rWORD8,rSHL
++	srw	r0, rWORD8, rSHR
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD5, 0, rSTR1
++	addi	rSTR1, rSTR1, 4
++#else
++	lwz	rWORD5, 0(rSTR1)
++#endif
++	or	rWORD6, r0, rWORD6_SHIFT
++	slw	rWORD6_SHIFT, rWORD8, rSHL
+ L(duP2e):
+-	lwz	rWORD7,4(rSTR1)
+-	lwz	rWORD8,4(rSTR2)
+-	cmplw	cr6,rWORD5,rWORD6
+-	srw	rG,rWORD8,rSHR
+-	slw	rB,rWORD8,rSHL
+-	or	rWORD8,rG,rH
+-	blt	cr7,L(duP2x)
+-	lwz	rWORD1,8(rSTR1)
+-	lwz	rWORD2,8(rSTR2)
+-	cmplw	cr5,rWORD7,rWORD8
+-	bne	cr6,L(duLcr6)
+-	srw	rA,rWORD2,rSHR
+-	slw	rD,rWORD2,rSHL
+-	or	rWORD2,rA,rB
+-	lwz	rWORD3,12(rSTR1)
+-	lwz	rWORD4,12(rSTR2)
+-	cmplw	cr0,rWORD1,rWORD2
+-	bne	cr5,L(duLcr5)
+-	srw	rC,rWORD4,rSHR
+-	slw	rF,rWORD4,rSHL
+-	or	rWORD4,rC,rD
+-	addi	rSTR1,rSTR1,4
+-	addi	rSTR2,rSTR2,4
+-	cmplw	cr1,rWORD3,rWORD4
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD7, 0, rSTR1
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD7, 4(rSTR1)
++	lwz	rWORD8, 4(rSTR2)
++#endif
++	cmplw	cr6, rWORD5, rWORD6
++	srw	r12, rWORD8, rSHR
++	slw	rWORD8_SHIFT, rWORD8, rSHL
++	or	rWORD8, r12, rWORD6_SHIFT
++	blt	cr7, L(duP2x)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD1, 8(rSTR1)
++	lwz	rWORD2, 8(rSTR2)
++#endif
++	cmplw	cr5, rWORD7, rWORD8
++	bne	cr6, L(duLcr6)
++	srw	r0, rWORD2, rSHR
++	slw	rWORD2_SHIFT, rWORD2, rSHL
++	or	rWORD2, r0, rWORD8_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD3, 0, rSTR1
++	lwbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD3, 12(rSTR1)
++	lwz	rWORD4, 12(rSTR2)
++#endif
++	cmplw	cr7, rWORD1, rWORD2
++	bne	cr5, L(duLcr5)
++	srw	r12, rWORD4, rSHR
++	slw	rWORD4_SHIFT, rWORD4, rSHL
++	or	rWORD4, r12, rWORD2_SHIFT
++#ifndef __LITTLE_ENDIAN__
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#endif
++	cmplw	cr1, rWORD3, rWORD4
+ 	b	L(duLoop2)
+ 	.align	4
+ L(duP2x):
+-	cmplw	cr5,rWORD7,rWORD8
+-	addi	rSTR1,rSTR1,4
+-	addi	rSTR2,rSTR2,4
+-	bne	cr6,L(duLcr6)
+-	slwi.	rN,rN,3
+-	bne	cr5,L(duLcr5)
+-	cmplw	cr7,rN,rSHR
++	cmplw	cr5, rWORD7, rWORD8
++#ifndef __LITTLE_ENDIAN__
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#endif
++	bne	cr6, L(duLcr6)
++	slwi.	rN, rN, 3
++	bne	cr5, L(duLcr5)
++	cmplw	cr7, rN, rSHR
+ 	beq	L(duZeroReturn)
+-	li	rA,0
+-	ble	cr7,L(dutrim)
+-	lwz	rWORD2,4(rSTR2)
+-	srw	rA,rWORD2,rSHR
++	li	r0, 0
++	ble	cr7, L(dutrim)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD2, 4(rSTR2)
++#endif
++	srw	r0, rWORD2, rSHR
+ 	b	L(dutrim)
+ 
+ /* Remainder is 12 */
+ 	.align	4
+ L(duP3):
+-	srw	rC,rWORD8,rSHR
+-	lwz	rWORD3,0(rSTR1)
+-	slw	rF,rWORD8,rSHL
+-	or	rWORD4,rC,rH
++	srw	r12, rWORD8, rSHR
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD3, 0, rSTR1
++	addi	rSTR1, rSTR1, 4
++#else
++	lwz	rWORD3, 0(rSTR1)
++#endif
++	slw	rWORD4_SHIFT, rWORD8, rSHL
++	or	rWORD4, r12, rWORD6_SHIFT
+ L(duP3e):
+-	lwz	rWORD5,4(rSTR1)
+-	lwz	rWORD6,4(rSTR2)
+-	cmplw	cr1,rWORD3,rWORD4
+-	srw	rE,rWORD6,rSHR
+-	slw	rH,rWORD6,rSHL
+-	or	rWORD6,rE,rF
+-	lwz	rWORD7,8(rSTR1)
+-	lwz	rWORD8,8(rSTR2)
+-	cmplw	cr6,rWORD5,rWORD6
+-	bne	cr1,L(duLcr1)
+-	srw	rG,rWORD8,rSHR
+-	slw	rB,rWORD8,rSHL
+-	or	rWORD8,rG,rH
+-	blt	cr7,L(duP3x)
+-	lwz	rWORD1,12(rSTR1)
+-	lwz	rWORD2,12(rSTR2)
+-	cmplw	cr5,rWORD7,rWORD8
+-	bne	cr6,L(duLcr6)
+-	srw	rA,rWORD2,rSHR
+-	slw	rD,rWORD2,rSHL
+-	or	rWORD2,rA,rB
+-	addi	rSTR1,rSTR1,8
+-	addi	rSTR2,rSTR2,8
+-	cmplw	cr0,rWORD1,rWORD2
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD5, 0, rSTR1
++	lwbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD5, 4(rSTR1)
++	lwz	rWORD6, 4(rSTR2)
++#endif
++	cmplw	cr1, rWORD3, rWORD4
++	srw	r0, rWORD6, rSHR
++	slw	rWORD6_SHIFT, rWORD6, rSHL
++	or	rWORD6, r0, rWORD4_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD7, 0, rSTR1
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD7, 8(rSTR1)
++	lwz	rWORD8, 8(rSTR2)
++#endif
++	cmplw	cr6, rWORD5, rWORD6
++	bne	cr1, L(duLcr1)
++	srw	r12, rWORD8, rSHR
++	slw	rWORD8_SHIFT, rWORD8, rSHL
++	or	rWORD8, r12, rWORD6_SHIFT
++	blt	cr7, L(duP3x)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD1, 12(rSTR1)
++	lwz	rWORD2, 12(rSTR2)
++#endif
++	cmplw	cr5, rWORD7, rWORD8
++	bne	cr6, L(duLcr6)
++	srw	r0, rWORD2, rSHR
++	slw	rWORD2_SHIFT, rWORD2, rSHL
++	or	rWORD2, r0, rWORD8_SHIFT
++#ifndef __LITTLE_ENDIAN__
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#endif
++	cmplw	cr7, rWORD1, rWORD2
+ 	b	L(duLoop1)
+ 	.align	4
+ L(duP3x):
+-	addi	rSTR1,rSTR1,8
+-	addi	rSTR2,rSTR2,8
+-	bne	cr1,L(duLcr1)
+-	cmplw	cr5,rWORD7,rWORD8
+-	bne	cr6,L(duLcr6)
+-	slwi.	rN,rN,3
+-	bne	cr5,L(duLcr5)
+-	cmplw	cr7,rN,rSHR
++#ifndef __LITTLE_ENDIAN__
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#endif
++#if 0
++/* Redundant: cr1 was already tested on the path into L(duP3x).  */
++	bne	cr1, L(duLcr1)
++#endif
++	cmplw	cr5, rWORD7, rWORD8
++	bne	cr6, L(duLcr6)
++	slwi.	rN, rN, 3
++	bne	cr5, L(duLcr5)
++	cmplw	cr7, rN, rSHR
+ 	beq	L(duZeroReturn)
+-	li	rA,0
+-	ble	cr7,L(dutrim)
+-	lwz	rWORD2,4(rSTR2)
+-	srw	rA,rWORD2,rSHR
++	li	r0, 0
++	ble	cr7, L(dutrim)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD2, 4(rSTR2)
++#endif
++	srw	r0, rWORD2, rSHR
+ 	b	L(dutrim)
+ 
+ /* Count is a multiple of 16, remainder is 0 */
+ 	.align	4
+ L(duP4):
+-	mtctr	rTMP
+-	srw	rA,rWORD8,rSHR
+-	lwz	rWORD1,0(rSTR1)
+-	slw	rD,rWORD8,rSHL
+-	or	rWORD2,rA,rH
++	mtctr	r0
++	srw	r0, rWORD8, rSHR
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	addi	rSTR1, rSTR1, 4
++#else
++	lwz	rWORD1, 0(rSTR1)
++#endif
++	slw	rWORD2_SHIFT, rWORD8, rSHL
++	or	rWORD2, r0, rWORD6_SHIFT
+ L(duP4e):
+-	lwz	rWORD3,4(rSTR1)
+-	lwz	rWORD4,4(rSTR2)
+-	cmplw	cr0,rWORD1,rWORD2
+-	srw	rC,rWORD4,rSHR
+-	slw	rF,rWORD4,rSHL
+-	or	rWORD4,rC,rD
+-	lwz	rWORD5,8(rSTR1)
+-	lwz	rWORD6,8(rSTR2)
+-	cmplw	cr1,rWORD3,rWORD4
+-	bne	cr0,L(duLcr0)
+-	srw	rE,rWORD6,rSHR
+-	slw	rH,rWORD6,rSHL
+-	or	rWORD6,rE,rF
+-	lwzu	rWORD7,12(rSTR1)
+-	lwzu	rWORD8,12(rSTR2)
+-	cmplw	cr6,rWORD5,rWORD6
+-	bne	cr1,L(duLcr1)
+-	srw	rG,rWORD8,rSHR
+-	slw	rB,rWORD8,rSHL
+-	or	rWORD8,rG,rH
+-	cmplw	cr5,rWORD7,rWORD8
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD3, 0, rSTR1
++	lwbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD3, 4(rSTR1)
++	lwz	rWORD4, 4(rSTR2)
++#endif
++	cmplw	cr7, rWORD1, rWORD2
++	srw	r12, rWORD4, rSHR
++	slw	rWORD4_SHIFT, rWORD4, rSHL
++	or	rWORD4, r12, rWORD2_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD5, 0, rSTR1
++	lwbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD5, 8(rSTR1)
++	lwz	rWORD6, 8(rSTR2)
++#endif
++	cmplw	cr1, rWORD3, rWORD4
++	bne	cr7, L(duLcr7)
++	srw	r0, rWORD6, rSHR
++	slw	rWORD6_SHIFT, rWORD6, rSHL
++	or	rWORD6, r0, rWORD4_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD7, 0, rSTR1
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwzu	rWORD7, 12(rSTR1)
++	lwzu	rWORD8, 12(rSTR2)
++#endif
++	cmplw	cr6, rWORD5, rWORD6
++	bne	cr1, L(duLcr1)
++	srw	r12, rWORD8, rSHR
++	slw	rWORD8_SHIFT, rWORD8, rSHL
++	or	rWORD8, r12, rWORD6_SHIFT
++	cmplw	cr5, rWORD7, rWORD8
+ 	bdz	L(du24)		/* Adjust CTR as we start with +4 */
+ /* This is the primary loop */
+ 	.align	4
+ L(duLoop):
+-	lwz	rWORD1,4(rSTR1)
+-	lwz	rWORD2,4(rSTR2)
+-	cmplw	cr1,rWORD3,rWORD4
+-	bne	cr6,L(duLcr6)
+-	srw	rA,rWORD2,rSHR
+-	slw	rD,rWORD2,rSHL
+-	or	rWORD2,rA,rB
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD1, 4(rSTR1)
++	lwz	rWORD2, 4(rSTR2)
++#endif
++	cmplw	cr1, rWORD3, rWORD4
++	bne	cr6, L(duLcr6)
++	srw	r0, rWORD2, rSHR
++	slw	rWORD2_SHIFT, rWORD2, rSHL
++	or	rWORD2, r0, rWORD8_SHIFT
+ L(duLoop1):
+-	lwz	rWORD3,8(rSTR1)
+-	lwz	rWORD4,8(rSTR2)
+-	cmplw	cr6,rWORD5,rWORD6
+-	bne	cr5,L(duLcr5)
+-	srw	rC,rWORD4,rSHR
+-	slw	rF,rWORD4,rSHL
+-	or	rWORD4,rC,rD
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD3, 0, rSTR1
++	lwbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD3, 8(rSTR1)
++	lwz	rWORD4, 8(rSTR2)
++#endif
++	cmplw	cr6, rWORD5, rWORD6
++	bne	cr5, L(duLcr5)
++	srw	r12, rWORD4, rSHR
++	slw	rWORD4_SHIFT, rWORD4, rSHL
++	or	rWORD4, r12, rWORD2_SHIFT
+ L(duLoop2):
+-	lwz	rWORD5,12(rSTR1)
+-	lwz	rWORD6,12(rSTR2)
+-	cmplw	cr5,rWORD7,rWORD8
+-	bne	cr0,L(duLcr0)
+-	srw	rE,rWORD6,rSHR
+-	slw	rH,rWORD6,rSHL
+-	or	rWORD6,rE,rF
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD5, 0, rSTR1
++	lwbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD5, 12(rSTR1)
++	lwz	rWORD6, 12(rSTR2)
++#endif
++	cmplw	cr5, rWORD7, rWORD8
++	bne	cr7, L(duLcr7)
++	srw	r0, rWORD6, rSHR
++	slw	rWORD6_SHIFT, rWORD6, rSHL
++	or	rWORD6, r0, rWORD4_SHIFT
+ L(duLoop3):
+-	lwzu	rWORD7,16(rSTR1)
+-	lwzu	rWORD8,16(rSTR2)
+-	cmplw	cr0,rWORD1,rWORD2
+-	bne	cr1,L(duLcr1)
+-	srw	rG,rWORD8,rSHR
+-	slw	rB,rWORD8,rSHL
+-	or	rWORD8,rG,rH
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD7, 0, rSTR1
++	lwbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 4
++	addi	rSTR2, rSTR2, 4
++#else
++	lwzu	rWORD7, 16(rSTR1)
++	lwzu	rWORD8, 16(rSTR2)
++#endif
++	cmplw	cr7, rWORD1, rWORD2
++	bne	cr1, L(duLcr1)
++	srw	r12, rWORD8, rSHR
++	slw	rWORD8_SHIFT, rWORD8, rSHL
++	or	rWORD8, r12, rWORD6_SHIFT
+ 	bdnz	L(duLoop)
+ 
+ L(duL4):
+-	bne	cr1,L(duLcr1)
+-	cmplw	cr1,rWORD3,rWORD4
+-	bne	cr6,L(duLcr6)
+-	cmplw	cr6,rWORD5,rWORD6
+-	bne	cr5,L(duLcr5)
+-	cmplw	cr5,rWORD7,rWORD8
++#if 0
++/* Redundant: L(duLoop3) tests cr1 just before falling through here.  */
++	bne	cr1, L(duLcr1)
++#endif
++	cmplw	cr1, rWORD3, rWORD4
++	bne	cr6, L(duLcr6)
++	cmplw	cr6, rWORD5, rWORD6
++	bne	cr5, L(duLcr5)
++	cmplw	cr5, rWORD7, rWORD8
+ L(du44):
+-	bne	cr0,L(duLcr0)
++	bne	cr7, L(duLcr7)
+ L(du34):
+-	bne	cr1,L(duLcr1)
++	bne	cr1, L(duLcr1)
+ L(du24):
+-	bne	cr6,L(duLcr6)
++	bne	cr6, L(duLcr6)
+ L(du14):
+-	slwi.	rN,rN,3
+-	bne	cr5,L(duLcr5)
++	slwi.	rN, rN, 3
++	bne	cr5, L(duLcr5)
+ /* At this point we have a remainder of 1 to 3 bytes to compare.  We use
+    shift right to eliminate bits beyond the compare length.
++   This allows the use of word subtract to compute the final result.
+ 
+    However it may not be safe to load rWORD2 which may be beyond the
+    string length. So we compare the bit length of the remainder to
+    the right shift count (rSHR). If the bit count is less than or equal
+    we do not need to load rWORD2 (all significant bits are already in
+-   rB).  */
+-	cmplw	cr7,rN,rSHR
++   rWORD8_SHIFT).  */
++	cmplw	cr7, rN, rSHR
+ 	beq	L(duZeroReturn)
+-	li	rA,0
+-	ble	cr7,L(dutrim)
+-	lwz	rWORD2,4(rSTR2)
+-	srw	rA,rWORD2,rSHR
++	li	r0, 0
++	ble	cr7, L(dutrim)
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD2, 0, rSTR2
++	addi	rSTR2, rSTR2, 4
++#else
++	lwz	rWORD2, 4(rSTR2)
++#endif
++	srw	r0, rWORD2, rSHR
+ 	.align	4
+ L(dutrim):
+-	lwz	rWORD1,4(rSTR1)
+-	lwz	r31,48(1)
+-	subfic	rN,rN,32	/* Shift count is 32 - (rN * 8).  */
+-	or	rWORD2,rA,rB
+-	lwz	r30,44(1)
+-	lwz	r29,40(r1)
+-	srw	rWORD1,rWORD1,rN
+-	srw	rWORD2,rWORD2,rN
+-	lwz	r28,36(r1)
+-	lwz	r27,32(r1)
+-	cmplw	rWORD1,rWORD2
+-	li	rRTN,0
+-	beq	L(dureturn26)
+-	li	rRTN,1
+-	bgt	L(dureturn26)
+-	li	rRTN,-1
++#ifdef __LITTLE_ENDIAN__
++	lwbrx	rWORD1, 0, rSTR1
++#else
++	lwz	rWORD1, 4(rSTR1)
++#endif
++	lwz	rWORD8, 48(r1)
++	subfic	rN, rN, 32	/* Shift count is 32 - (rN * 8).  */
++	or	rWORD2, r0, rWORD8_SHIFT
++	lwz	rWORD7, 44(r1)
++	lwz	rSHL, 40(r1)
++	srw	rWORD1, rWORD1, rN
++	srw	rWORD2, rWORD2, rN
++	lwz	rSHR, 36(r1)
++	lwz	rWORD8_SHIFT, 32(r1)
++	sub	rRTN, rWORD1, rWORD2
+ 	b	L(dureturn26)
+ 	.align	4
+-L(duLcr0):
+-	lwz	r31,48(1)
+-	lwz	r30,44(1)
+-	li	rRTN,1
+-	bgt	cr0,L(dureturn29)
+-	lwz	r29,40(r1)
+-	lwz	r28,36(r1)
+-	li	rRTN,-1
++L(duLcr7):
++	lwz	rWORD8, 48(r1)
++	lwz	rWORD7, 44(r1)
++	li	rRTN, 1
++	bgt	cr7, L(dureturn29)
++	lwz	rSHL, 40(r1)
++	lwz	rSHR, 36(r1)
++	li	rRTN, -1
+ 	b	L(dureturn27)
+ 	.align	4
+ L(duLcr1):
+-	lwz	r31,48(1)
+-	lwz	r30,44(1)
+-	li	rRTN,1
+-	bgt	cr1,L(dureturn29)
+-	lwz	r29,40(r1)
+-	lwz	r28,36(r1)
+-	li	rRTN,-1
++	lwz	rWORD8, 48(r1)
++	lwz	rWORD7, 44(r1)
++	li	rRTN, 1
++	bgt	cr1, L(dureturn29)
++	lwz	rSHL, 40(r1)
++	lwz	rSHR, 36(r1)
++	li	rRTN, -1
+ 	b	L(dureturn27)
+ 	.align	4
+ L(duLcr6):
+-	lwz	r31,48(1)
+-	lwz	r30,44(1)
+-	li	rRTN,1
+-	bgt	cr6,L(dureturn29)
+-	lwz	r29,40(r1)
+-	lwz	r28,36(r1)
+-	li	rRTN,-1
++	lwz	rWORD8, 48(r1)
++	lwz	rWORD7, 44(r1)
++	li	rRTN, 1
++	bgt	cr6, L(dureturn29)
++	lwz	rSHL, 40(r1)
++	lwz	rSHR, 36(r1)
++	li	rRTN, -1
+ 	b	L(dureturn27)
+ 	.align	4
+ L(duLcr5):
+-	lwz	r31,48(1)
+-	lwz	r30,44(1)
+-	li	rRTN,1
+-	bgt	cr5,L(dureturn29)
+-	lwz	r29,40(r1)
+-	lwz	r28,36(r1)
+-	li	rRTN,-1
++	lwz	rWORD8, 48(r1)
++	lwz	rWORD7, 44(r1)
++	li	rRTN, 1
++	bgt	cr5, L(dureturn29)
++	lwz	rSHL, 40(r1)
++	lwz	rSHR, 36(r1)
++	li	rRTN, -1
+ 	b	L(dureturn27)
+ 	.align	3
+ L(duZeroReturn):
+-	li	rRTN,0
++	li	rRTN, 0
+ 	.align	4
+ L(dureturn):
+-	lwz	r31,48(1)
+-	lwz	r30,44(1)
++	lwz	rWORD8, 48(r1)
++	lwz	rWORD7, 44(r1)
+ L(dureturn29):
+-	lwz	r29,40(r1)
+-	lwz	r28,36(r1)
++	lwz	rSHL, 40(r1)
++	lwz	rSHR, 36(r1)
+ L(dureturn27):
+-	lwz	r27,32(r1)
++	lwz	rWORD8_SHIFT, 32(r1)
+ L(dureturn26):
+-	lwz	r26,28(r1)
++	lwz	rWORD2_SHIFT, 28(r1)
+ L(dureturn25):
+-	lwz	r25,24(r1)
+-	lwz	r24,20(r1)
+-	lwz	1,0(1)
++	lwz	rWORD4_SHIFT, 24(r1)
++	lwz	rWORD6_SHIFT, 20(r1)
++	addi	r1, r1, 64
++	cfi_adjust_cfa_offset(-64)
+ 	blr
+ END (BP_SYM (memcmp))
++
+ libc_hidden_builtin_def (memcmp)
+ weak_alias (memcmp,bcmp)
+diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcmp.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcmp.S
+--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcmp.S	2014-05-28 19:22:37.000000000 -0500
++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power4/memcmp.S	2014-05-29 09:35:25.000000000 -0500
+@@ -1,5 +1,5 @@
+-/* Optimized strcmp implementation for PowerPC64.
+-   Copyright (C) 2003, 2006, 2011 Free Software Foundation, Inc.
++/* Optimized memcmp implementation for PowerPC64.
++   Copyright (C) 2003-2014 Free Software Foundation, Inc.
+    This file is part of the GNU C Library.
+ 
+    The GNU C Library is free software; you can redistribute it and/or
+@@ -17,307 +17,492 @@
+    <http://www.gnu.org/licenses/>.  */
+ 
+ #include <sysdep.h>
+-#include <bp-sym.h>
+-#include <bp-asm.h>
+ 
+-/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5])  */
++/* int [r3] memcmp (const char *s1 [r3],
++		    const char *s2 [r4],
++		    size_t size [r5])  */
+ 
+ 	.machine power4
+-EALIGN (BP_SYM(memcmp), 4, 0)
++EALIGN (memcmp, 4, 0)
+ 	CALL_MCOUNT 3
+ 
+-#define rTMP	r0
+ #define rRTN	r3
+ #define rSTR1	r3	/* first string arg */
+ #define rSTR2	r4	/* second string arg */
+ #define rN	r5	/* max string length */
+-/* Note:  The Bounded pointer support in this code is broken.  This code
+-   was inherited from PPC32 and that support was never completed.
+-   Current PPC gcc does not support -fbounds-check or -fbounded-pointers.  */
+ #define rWORD1	r6	/* current word in s1 */
+ #define rWORD2	r7	/* current word in s2 */
+ #define rWORD3	r8	/* next word in s1 */
+ #define rWORD4	r9	/* next word in s2 */
+ #define rWORD5	r10	/* next word in s1 */
+ #define rWORD6	r11	/* next word in s2 */
+-#define rBITDIF	r12	/* bits that differ in s1 & s2 words */
+ #define rWORD7	r30	/* next word in s1 */
+ #define rWORD8	r31	/* next word in s2 */
+ 
+-	xor	rTMP, rSTR2, rSTR1
++	xor	r0, rSTR2, rSTR1
+ 	cmpldi	cr6, rN, 0
+ 	cmpldi	cr1, rN, 12
+-	clrldi.	rTMP, rTMP, 61
+-	clrldi	rBITDIF, rSTR1, 61
+-	cmpldi	cr5, rBITDIF, 0
++	clrldi.	r0, r0, 61
++	clrldi	r12, rSTR1, 61
++	cmpldi	cr5, r12, 0
+ 	beq-	cr6, L(zeroLength)
+-	dcbt	0,rSTR1
+-	dcbt	0,rSTR2
+-/* If less than 8 bytes or not aligned, use the unalligned
++	dcbt	0, rSTR1
++	dcbt	0, rSTR2
++/* If less than 12 bytes (cr1, set above), use the unaligned
+    byte loop.  */
+ 	blt	cr1, L(bytealigned)
+-	std	rWORD8,-8(r1)	
+-	cfi_offset(rWORD8,-8)
+-	std	rWORD7,-16(r1)	
+-	cfi_offset(rWORD7,-16)
++	std	rWORD8, -8(r1)
++	cfi_offset(rWORD8, -8)
++	std	rWORD7, -16(r1)
++	cfi_offset(rWORD7, -16)
+ 	bne	L(unaligned)
+ /* At this point we know both strings have the same alignment and the
+-   compare length is at least 8 bytes.  rBITDIF containes the low order
++   compare length is at least 8 bytes.  r12 contains the low order
+    3 bits of rSTR1 and cr5 contains the result of the logical compare
+-   of rBITDIF to 0.  If rBITDIF == 0 then we are already double word 
+-   aligned and can perform the DWaligned loop.
+-  
++   of r12 to 0.  If r12 == 0 then we are already double word
++   aligned and can perform the DW aligned loop.
++
+    Otherwise we know the two strings have the same alignment (but not
+-   yet DW).  So we can force the string addresses to the next lower DW
+-   boundary and special case this first DW word using shift left to
+-   ellimiate bits preceeding the first byte.  Since we want to join the
+-   normal (DWaligned) compare loop, starting at the second double word,
++   yet DW).  So we force the string addresses to the next lower DW
++   boundary and special case this first DW using shift left to
++   eliminate bits preceding the first byte.  Since we want to join the
++   normal (DW aligned) compare loop, starting at the second double word,
+    we need to adjust the length (rN) and special case the loop
+-   versioning for the first DW. This insures that the loop count is
+-   correct and the first DW (shifted) is in the expected resister pair.  */
+-	.align 4
++   versioning for the first DW. This ensures that the loop count is
++   correct and the first DW (shifted) is in the expected register pair.  */
++	.align	4
+ L(samealignment):
+ 	clrrdi	rSTR1, rSTR1, 3
+ 	clrrdi	rSTR2, rSTR2, 3
+ 	beq	cr5, L(DWaligned)
+-	add	rN, rN, rBITDIF
+-	sldi	r11, rBITDIF, 3
+-	srdi	rTMP, rN, 5	/* Divide by 32 */
+-	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
++	add	rN, rN, r12
++	sldi	rWORD6, r12, 3
++	srdi	r0, rN, 5	/* Divide by 32 */
++	andi.	r12, rN, 24	/* Get the DW remainder */
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD1, 0(rSTR1)
+ 	ld	rWORD2, 0(rSTR2)
+-	cmpldi	cr1, rBITDIF, 16
++#endif
++	cmpldi	cr1, r12, 16
+ 	cmpldi	cr7, rN, 32
+ 	clrldi	rN, rN, 61
+ 	beq	L(dPs4)
+-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
++	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+ 	bgt	cr1, L(dPs3)
+ 	beq	cr1, L(dPs2)
+ 
+ /* Remainder is 8 */
+-	.align 3
++	.align	3
+ L(dsP1):
+-	sld	rWORD5, rWORD1, r11
+-	sld	rWORD6, rWORD2, r11
++	sld	rWORD5, rWORD1, rWORD6
++	sld	rWORD6, rWORD2, rWORD6
+ 	cmpld	cr5, rWORD5, rWORD6
+ 	blt	cr7, L(dP1x)
+ /* Do something useful in this cycle since we have to branch anyway.  */
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD1, 8(rSTR1)
+ 	ld	rWORD2, 8(rSTR2)
+-	cmpld	cr0, rWORD1, rWORD2
++#endif
++	cmpld	cr7, rWORD1, rWORD2
+ 	b	L(dP1e)
+ /* Remainder is 16 */
+-	.align 4
++	.align	4
+ L(dPs2):
+-	sld	rWORD5, rWORD1, r11
+-	sld	rWORD6, rWORD2, r11
++	sld	rWORD5, rWORD1, rWORD6
++	sld	rWORD6, rWORD2, rWORD6
+ 	cmpld	cr6, rWORD5, rWORD6
+ 	blt	cr7, L(dP2x)
+ /* Do something useful in this cycle since we have to branch anyway.  */
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD7, 0, rSTR1
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD7, 8(rSTR1)
+ 	ld	rWORD8, 8(rSTR2)
++#endif
+ 	cmpld	cr5, rWORD7, rWORD8
+ 	b	L(dP2e)
+ /* Remainder is 24 */
+-	.align 4
++	.align	4
+ L(dPs3):
+-	sld	rWORD3, rWORD1, r11
+-	sld	rWORD4, rWORD2, r11
++	sld	rWORD3, rWORD1, rWORD6
++	sld	rWORD4, rWORD2, rWORD6
+ 	cmpld	cr1, rWORD3, rWORD4
+ 	b	L(dP3e)
+ /* Count is a multiple of 32, remainder is 0 */
+-	.align 4
++	.align	4
+ L(dPs4):
+-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+-	sld	rWORD1, rWORD1, r11
+-	sld	rWORD2, rWORD2, r11
+-	cmpld	cr0, rWORD1, rWORD2
++	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
++	sld	rWORD1, rWORD1, rWORD6
++	sld	rWORD2, rWORD2, rWORD6
++	cmpld	cr7, rWORD1, rWORD2
+ 	b	L(dP4e)
+ 
+ /* At this point we know both strings are double word aligned and the
+    compare length is at least 8 bytes.  */
+-	.align 4
++	.align	4
+ L(DWaligned):
+-	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
+-	srdi	rTMP, rN, 5	/* Divide by 32 */
+-	cmpldi	cr1, rBITDIF, 16
++	andi.	r12, rN, 24	/* Get the DW remainder */
++	srdi	r0, rN, 5	/* Divide by 32 */
++	cmpldi	cr1, r12, 16
+ 	cmpldi	cr7, rN, 32
+ 	clrldi	rN, rN, 61
+ 	beq	L(dP4)
+ 	bgt	cr1, L(dP3)
+ 	beq	cr1, L(dP2)
+-		
++
+ /* Remainder is 8 */
+-	.align 4
++	.align	4
+ L(dP1):
+-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
++	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+ /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
+-   (8-15 byte compare), we want to use only volitile registers.  This
+-   means we can avoid restoring non-volitile registers since we did not
++   (8-15 byte compare), we want to use only volatile registers.  This
++   means we can avoid restoring non-volatile registers since we did not
+    change any on the early exit path.  The key here is the non-early
+-   exit path only cares about the condition code (cr5), not about which 
++   exit path only cares about the condition code (cr5), not about which
+    register pair was used.  */
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD5, 0, rSTR1
++	ldbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD5, 0(rSTR1)
+ 	ld	rWORD6, 0(rSTR2)
++#endif
+ 	cmpld	cr5, rWORD5, rWORD6
+ 	blt	cr7, L(dP1x)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD1, 8(rSTR1)
+ 	ld	rWORD2, 8(rSTR2)
+-	cmpld	cr0, rWORD1, rWORD2
++#endif
++	cmpld	cr7, rWORD1, rWORD2
+ L(dP1e):
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD3, 0, rSTR1
++	ldbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD3, 16(rSTR1)
+ 	ld	rWORD4, 16(rSTR2)
++#endif
+ 	cmpld	cr1, rWORD3, rWORD4
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD5, 0, rSTR1
++	ldbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD5, 24(rSTR1)
+ 	ld	rWORD6, 24(rSTR2)
++#endif
+ 	cmpld	cr6, rWORD5, rWORD6
+-	bne	cr5, L(dLcr5)
+-	bne	cr0, L(dLcr0)
+-	
++	bne	cr5, L(dLcr5x)
++	bne	cr7, L(dLcr7x)
++
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD7, 0, rSTR1
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ldu	rWORD7, 32(rSTR1)
+ 	ldu	rWORD8, 32(rSTR2)
++#endif
+ 	bne	cr1, L(dLcr1)
+ 	cmpld	cr5, rWORD7, rWORD8
+ 	bdnz	L(dLoop)
+ 	bne	cr6, L(dLcr6)
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
+-	.align 3
++	ld	rWORD8, -8(r1)
++	ld	rWORD7, -16(r1)
++	.align	3
+ L(dP1x):
+ 	sldi.	r12, rN, 3
+-	bne	cr5, L(dLcr5)
++	bne	cr5, L(dLcr5x)
+ 	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
+ 	bne	L(d00)
+ 	li	rRTN, 0
+ 	blr
+-		
++
+ /* Remainder is 16 */
+-	.align 4
++	.align	4
+ L(dP2):
+-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
++	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD5, 0, rSTR1
++	ldbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD5, 0(rSTR1)
+ 	ld	rWORD6, 0(rSTR2)
++#endif
+ 	cmpld	cr6, rWORD5, rWORD6
+ 	blt	cr7, L(dP2x)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD7, 0, rSTR1
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD7, 8(rSTR1)
+ 	ld	rWORD8, 8(rSTR2)
++#endif
+ 	cmpld	cr5, rWORD7, rWORD8
+ L(dP2e):
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD1, 16(rSTR1)
+ 	ld	rWORD2, 16(rSTR2)
+-	cmpld	cr0, rWORD1, rWORD2
++#endif
++	cmpld	cr7, rWORD1, rWORD2
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD3, 0, rSTR1
++	ldbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD3, 24(rSTR1)
+ 	ld	rWORD4, 24(rSTR2)
++#endif
+ 	cmpld	cr1, rWORD3, rWORD4
++#ifndef __LITTLE_ENDIAN__
+ 	addi	rSTR1, rSTR1, 8
+ 	addi	rSTR2, rSTR2, 8
++#endif
+ 	bne	cr6, L(dLcr6)
+ 	bne	cr5, L(dLcr5)
+ 	b	L(dLoop2)
+ /* Again we are on a early exit path (16-23 byte compare), we want to
+-   only use volitile registers and avoid restoring non-volitile
++   only use volatile registers and avoid restoring non-volatile
+    registers.  */
+-	.align 4
++	.align	4
+ L(dP2x):
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD3, 0, rSTR1
++	ldbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD3, 8(rSTR1)
+ 	ld	rWORD4, 8(rSTR2)
+-	cmpld	cr5, rWORD3, rWORD4
++#endif
++	cmpld	cr1, rWORD3, rWORD4
+ 	sldi.	r12, rN, 3
+-	bne	cr6, L(dLcr6)
++	bne	cr6, L(dLcr6x)
++#ifndef __LITTLE_ENDIAN__
+ 	addi	rSTR1, rSTR1, 8
+ 	addi	rSTR2, rSTR2, 8
+-	bne	cr5, L(dLcr5)
++#endif
++	bne	cr1, L(dLcr1x)
+ 	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
+ 	bne	L(d00)
+ 	li	rRTN, 0
+ 	blr
+-		
++
+ /* Remainder is 24 */
+-	.align 4
++	.align	4
+ L(dP3):
+-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
++	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD3, 0, rSTR1
++	ldbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD3, 0(rSTR1)
+ 	ld	rWORD4, 0(rSTR2)
++#endif
+ 	cmpld	cr1, rWORD3, rWORD4
+ L(dP3e):
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD5, 0, rSTR1
++	ldbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD5, 8(rSTR1)
+ 	ld	rWORD6, 8(rSTR2)
++#endif
+ 	cmpld	cr6, rWORD5, rWORD6
+ 	blt	cr7, L(dP3x)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD7, 0, rSTR1
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD7, 16(rSTR1)
+ 	ld	rWORD8, 16(rSTR2)
++#endif
+ 	cmpld	cr5, rWORD7, rWORD8
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD1, 24(rSTR1)
+ 	ld	rWORD2, 24(rSTR2)
+-	cmpld	cr0, rWORD1, rWORD2
++#endif
++	cmpld	cr7, rWORD1, rWORD2
++#ifndef __LITTLE_ENDIAN__
+ 	addi	rSTR1, rSTR1, 16
+ 	addi	rSTR2, rSTR2, 16
++#endif
+ 	bne	cr1, L(dLcr1)
+ 	bne	cr6, L(dLcr6)
+ 	b	L(dLoop1)
+ /* Again we are on a early exit path (24-31 byte compare), we want to
+-   only use volitile registers and avoid restoring non-volitile
++   only use volatile registers and avoid restoring non-volatile
+    registers.  */
+-	.align 4
++	.align	4
+ L(dP3x):
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD1, 16(rSTR1)
+ 	ld	rWORD2, 16(rSTR2)
+-	cmpld	cr5, rWORD1, rWORD2
++#endif
++	cmpld	cr7, rWORD1, rWORD2
+ 	sldi.	r12, rN, 3
+-	bne	cr1, L(dLcr1)
++	bne	cr1, L(dLcr1x)
++#ifndef __LITTLE_ENDIAN__
+ 	addi	rSTR1, rSTR1, 16
+ 	addi	rSTR2, rSTR2, 16
+-	bne	cr6, L(dLcr6)
++#endif
++	bne	cr6, L(dLcr6x)
+ 	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
+-	bne	cr5, L(dLcr5)
++	bne	cr7, L(dLcr7x)
+ 	bne	L(d00)
+ 	li	rRTN, 0
+ 	blr
+-	
++
+ /* Count is a multiple of 32, remainder is 0 */
+-	.align 4
++	.align	4
+ L(dP4):
+-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
++	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD1, 0(rSTR1)
+ 	ld	rWORD2, 0(rSTR2)
+-	cmpld	cr0, rWORD1, rWORD2
++#endif
++	cmpld	cr7, rWORD1, rWORD2
+ L(dP4e):
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD3, 0, rSTR1
++	ldbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD3, 8(rSTR1)
+ 	ld	rWORD4, 8(rSTR2)
++#endif
+ 	cmpld	cr1, rWORD3, rWORD4
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD5, 0, rSTR1
++	ldbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD5, 16(rSTR1)
+ 	ld	rWORD6, 16(rSTR2)
++#endif
+ 	cmpld	cr6, rWORD5, rWORD6
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD7, 0, rSTR1
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ldu	rWORD7, 24(rSTR1)
+ 	ldu	rWORD8, 24(rSTR2)
++#endif
+ 	cmpld	cr5, rWORD7, rWORD8
+-	bne	cr0, L(dLcr0)
++	bne	cr7, L(dLcr7)
+ 	bne	cr1, L(dLcr1)
+ 	bdz-	L(d24)		/* Adjust CTR as we start with +4 */
+ /* This is the primary loop */
+-	.align 4
++	.align	4
+ L(dLoop):
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD1, 8(rSTR1)
+ 	ld	rWORD2, 8(rSTR2)
++#endif
+ 	cmpld	cr1, rWORD3, rWORD4
+ 	bne	cr6, L(dLcr6)
+ L(dLoop1):
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD3, 0, rSTR1
++	ldbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD3, 16(rSTR1)
+ 	ld	rWORD4, 16(rSTR2)
++#endif
+ 	cmpld	cr6, rWORD5, rWORD6
+ 	bne	cr5, L(dLcr5)
+ L(dLoop2):
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD5, 0, rSTR1
++	ldbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD5, 24(rSTR1)
+ 	ld	rWORD6, 24(rSTR2)
++#endif
+ 	cmpld	cr5, rWORD7, rWORD8
+-	bne	cr0, L(dLcr0)
++	bne	cr7, L(dLcr7)
+ L(dLoop3):
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD7, 0, rSTR1
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ldu	rWORD7, 32(rSTR1)
+ 	ldu	rWORD8, 32(rSTR2)
++#endif
+ 	bne-	cr1, L(dLcr1)
+-	cmpld	cr0, rWORD1, rWORD2
+-	bdnz+	L(dLoop)	
+-	
++	cmpld	cr7, rWORD1, rWORD2
++	bdnz+	L(dLoop)
++
+ L(dL4):
+ 	cmpld	cr1, rWORD3, rWORD4
+ 	bne	cr6, L(dLcr6)
+@@ -325,84 +510,98 @@
+ 	bne	cr5, L(dLcr5)
+ 	cmpld	cr5, rWORD7, rWORD8
+ L(d44):
+-	bne	cr0, L(dLcr0)
++	bne	cr7, L(dLcr7)
+ L(d34):
+ 	bne	cr1, L(dLcr1)
+ L(d24):
+ 	bne	cr6, L(dLcr6)
+ L(d14):
+ 	sldi.	r12, rN, 3
+-	bne	cr5, L(dLcr5) 
++	bne	cr5, L(dLcr5)
+ L(d04):
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
++	ld	rWORD8, -8(r1)
++	ld	rWORD7, -16(r1)
+ 	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
+ 	beq	L(zeroLength)
+ /* At this point we have a remainder of 1 to 7 bytes to compare.  Since
+    we are aligned it is safe to load the whole double word, and use
+-   shift right double to elliminate bits beyond the compare length.  */ 
++   shift right double to eliminate bits beyond the compare length.  */
+ L(d00):
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD1, 8(rSTR1)
+-	ld	rWORD2, 8(rSTR2) 
++	ld	rWORD2, 8(rSTR2)
++#endif
+ 	srd	rWORD1, rWORD1, rN
+ 	srd	rWORD2, rWORD2, rN
+-	cmpld	cr5, rWORD1, rWORD2
+- 	bne	cr5, L(dLcr5x)
++	cmpld	cr7, rWORD1, rWORD2
++	bne	cr7, L(dLcr7x)
+ 	li	rRTN, 0
+ 	blr
+-	.align 4
+-L(dLcr0):
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
++
++	.align	4
++L(dLcr7):
++	ld	rWORD8, -8(r1)
++	ld	rWORD7, -16(r1)
++L(dLcr7x):
+ 	li	rRTN, 1
+-	bgtlr	cr0
++	bgtlr	cr7
+ 	li	rRTN, -1
+ 	blr
+-	.align 4
++	.align	4
+ L(dLcr1):
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
++	ld	rWORD8, -8(r1)
++	ld	rWORD7, -16(r1)
++L(dLcr1x):
+ 	li	rRTN, 1
+ 	bgtlr	cr1
+ 	li	rRTN, -1
+ 	blr
+-	.align 4
++	.align	4
+ L(dLcr6):
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
++	ld	rWORD8, -8(r1)
++	ld	rWORD7, -16(r1)
++L(dLcr6x):
+ 	li	rRTN, 1
+ 	bgtlr	cr6
+ 	li	rRTN, -1
+ 	blr
+-	.align 4
++	.align	4
+ L(dLcr5):
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
++	ld	rWORD8, -8(r1)
++	ld	rWORD7, -16(r1)
+ L(dLcr5x):
+ 	li	rRTN, 1
+ 	bgtlr	cr5
+ 	li	rRTN, -1
+ 	blr
+-	
+-	.align 4
++
++	.align	4
+ L(bytealigned):
+-	mtctr   rN	/* Power4 wants mtctr 1st in dispatch group */
++	mtctr	rN	/* Power4 wants mtctr 1st in dispatch group */
++#if 0
++/* Huh?  We've already branched on cr6!  */
+ 	beq-	cr6, L(zeroLength)
++#endif
+ 
+ /* We need to prime this loop.  This loop is swing modulo scheduled
+-   to avoid pipe delays.  The dependent instruction latencies (load to 
++   to avoid pipe delays.  The dependent instruction latencies (load to
+    compare to conditional branch) is 2 to 3 cycles.  In this loop each
+    dispatch group ends in a branch and takes 1 cycle.  Effectively
+-   the first iteration of the loop only serves to load operands and 
+-   branches based on compares are delayed until the next loop. 
++   the first iteration of the loop only serves to load operands and
++   branches based on compares are delayed until the next loop.
+ 
+    So we must precondition some registers and condition codes so that
+    we don't exit the loop early on the first iteration.  */
+-   
++
+ 	lbz	rWORD1, 0(rSTR1)
+ 	lbz	rWORD2, 0(rSTR2)
+ 	bdz-	L(b11)
+-	cmpld	cr0, rWORD1, rWORD2
++	cmpld	cr7, rWORD1, rWORD2
+ 	lbz	rWORD3, 1(rSTR1)
+ 	lbz	rWORD4, 1(rSTR2)
+ 	bdz-	L(b12)
+@@ -410,20 +609,20 @@
+ 	lbzu	rWORD5, 2(rSTR1)
+ 	lbzu	rWORD6, 2(rSTR2)
+ 	bdz-	L(b13)
+-	.align 4
++	.align	4
+ L(bLoop):
+ 	lbzu	rWORD1, 1(rSTR1)
+ 	lbzu	rWORD2, 1(rSTR2)
+-	bne-	cr0, L(bLcr0)
++	bne-	cr7, L(bLcr7)
+ 
+ 	cmpld	cr6, rWORD5, rWORD6
+ 	bdz-	L(b3i)
+-	
++
+ 	lbzu	rWORD3, 1(rSTR1)
+ 	lbzu	rWORD4, 1(rSTR2)
+ 	bne-	cr1, L(bLcr1)
+ 
+-	cmpld	cr0, rWORD1, rWORD2
++	cmpld	cr7, rWORD1, rWORD2
+ 	bdz-	L(b2i)
+ 
+ 	lbzu	rWORD5, 1(rSTR1)
+@@ -432,31 +631,31 @@
+ 
+ 	cmpld	cr1, rWORD3, rWORD4
+ 	bdnz+	L(bLoop)
+-	
++
+ /* We speculatively loading bytes before we have tested the previous
+    bytes.  But we must avoid overrunning the length (in the ctr) to
+-   prevent these speculative loads from causing a segfault.  In this 
++   prevent these speculative loads from causing a segfault.  In this
+    case the loop will exit early (before the all pending bytes are
+    tested.  In this case we must complete the pending operations
+    before returning.  */
+ L(b1i):
+-	bne-	cr0, L(bLcr0)
++	bne-	cr7, L(bLcr7)
+ 	bne-	cr1, L(bLcr1)
+ 	b	L(bx56)
+-	.align 4
++	.align	4
+ L(b2i):
+ 	bne-	cr6, L(bLcr6)
+-	bne-	cr0, L(bLcr0)
++	bne-	cr7, L(bLcr7)
+ 	b	L(bx34)
+-	.align 4
++	.align	4
+ L(b3i):
+ 	bne-	cr1, L(bLcr1)
+ 	bne-	cr6, L(bLcr6)
+ 	b	L(bx12)
+-	.align 4
+-L(bLcr0):
++	.align	4
++L(bLcr7):
+ 	li	rRTN, 1
+-	bgtlr	cr0
++	bgtlr	cr7
+ 	li	rRTN, -1
+ 	blr
+ L(bLcr1):
+@@ -471,116 +670,121 @@
+ 	blr
+ 
+ L(b13):
+-	bne-	cr0, L(bx12)
++	bne-	cr7, L(bx12)
+ 	bne-	cr1, L(bx34)
+ L(bx56):
+ 	sub	rRTN, rWORD5, rWORD6
+ 	blr
+ 	nop
+ L(b12):
+-	bne-	cr0, L(bx12)
+-L(bx34):	
++	bne-	cr7, L(bx12)
++L(bx34):
+ 	sub	rRTN, rWORD3, rWORD4
+ 	blr
+ L(b11):
+ L(bx12):
+ 	sub	rRTN, rWORD1, rWORD2
+ 	blr
+-	.align 4 
+-L(zeroLengthReturn):
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
++	.align	4
+ L(zeroLength):
+ 	li	rRTN, 0
+ 	blr
+ 
+-	.align 4
++	.align	4
+ /* At this point we know the strings have different alignment and the
+-   compare length is at least 8 bytes.  rBITDIF containes the low order
++   compare length is at least 8 bytes.  r12 contains the low order
+    3 bits of rSTR1 and cr5 contains the result of the logical compare
+-   of rBITDIF to 0.  If rBITDIF == 0 then rStr1 is double word 
++   of r12 to 0.  If r12 == 0 then rSTR1 is double word
+    aligned and can perform the DWunaligned loop.
+-  
+-   Otherwise we know that rSTR1 is not aready DW aligned yet.
++
++   Otherwise we know that rSTR1 is not DW aligned yet.
+    So we can force the string addresses to the next lower DW
+-   boundary and special case this first DW word using shift left to
+-   ellimiate bits preceeding the first byte.  Since we want to join the
++   boundary and special case this first DW using shift left to
++   eliminate bits preceding the first byte.  Since we want to join the
+    normal (DWaligned) compare loop, starting at the second double word,
+    we need to adjust the length (rN) and special case the loop
+-   versioning for the first DW. This insures that the loop count is
++   versioning for the first DW. This ensures that the loop count is
+    correct and the first DW (shifted) is in the expected resister pair.  */
+-#define rSHL	r29	/* Unaligned shift left count.  */
+-#define rSHR	r28	/* Unaligned shift right count.  */
+-#define rB		r27	/* Left rotation temp for rWORD2.  */
+-#define rD		r26	/* Left rotation temp for rWORD4.  */
+-#define rF		r25	/* Left rotation temp for rWORD6.  */
+-#define rH		r24	/* Left rotation temp for rWORD8.  */
+-#define rA		r0	/* Right rotation temp for rWORD2.  */
+-#define rC		r12	/* Right rotation temp for rWORD4.  */
+-#define rE		r0	/* Right rotation temp for rWORD6.  */
+-#define rG		r12	/* Right rotation temp for rWORD8.  */
++#define rSHL		r29	/* Unaligned shift left count.  */
++#define rSHR		r28	/* Unaligned shift right count.  */
++#define rWORD8_SHIFT	r27	/* duLoop: rWORD8 << rSHL, merged into rWORD2.  */
++#define rWORD2_SHIFT	r26	/* duLoop: rWORD2 << rSHL, merged into rWORD4.  */
++#define rWORD4_SHIFT	r25	/* duLoop: rWORD4 << rSHL, merged into rWORD6.  */
++#define rWORD6_SHIFT	r24	/* duLoop: rWORD6 << rSHL, merged into rWORD8.  */
+ L(unaligned):
+-	std	r29,-24(r1)	
+-	cfi_offset(r29,-24)
++	std	rSHL, -24(r1)
++	cfi_offset(rSHL, -24)
+ 	clrldi	rSHL, rSTR2, 61
+ 	beq-	cr6, L(duzeroLength)
+-	std	r28,-32(r1)	
+-	cfi_offset(r28,-32)
++	std	rSHR, -32(r1)
++	cfi_offset(rSHR, -32)
+ 	beq	cr5, L(DWunaligned)
+-	std	r27,-40(r1)	
+-	cfi_offset(r27,-40)
+-/* Adjust the logical start of rSTR2 ro compensate for the extra bits
++	std	rWORD8_SHIFT, -40(r1)
++	cfi_offset(rWORD8_SHIFT, -40)
++/* Adjust the logical start of rSTR2 to compensate for the extra bits
+    in the 1st rSTR1 DW.  */
+-	sub	r27, rSTR2, rBITDIF
++	sub	rWORD8_SHIFT, rSTR2, r12
+ /* But do not attempt to address the DW before that DW that contains
+    the actual start of rSTR2.  */
+ 	clrrdi	rSTR2, rSTR2, 3
+-	std	r26,-48(r1)	
+-	cfi_offset(r26,-48)
+-/* Compute the leaft/right shift counts for the unalign rSTR2,
+-   compensating for the logical (DW aligned) start of rSTR1.  */ 
+-	clrldi	rSHL, r27, 61
+-	clrrdi	rSTR1, rSTR1, 3	
+-	std	r25,-56(r1)	
+-	cfi_offset(r25,-56)
++	std	rWORD2_SHIFT, -48(r1)
++	cfi_offset(rWORD2_SHIFT, -48)
++/* Compute the left/right shift counts for the unaligned rSTR2,
++   compensating for the logical (DW aligned) start of rSTR1.  */
++	clrldi	rSHL, rWORD8_SHIFT, 61
++	clrrdi	rSTR1, rSTR1, 3
++	std	rWORD4_SHIFT, -56(r1)
++	cfi_offset(rWORD4_SHIFT, -56)
+ 	sldi	rSHL, rSHL, 3
+-	cmpld	cr5, r27, rSTR2
+-	add	rN, rN, rBITDIF
+-	sldi	r11, rBITDIF, 3
+-	std	r24,-64(r1)	
+-	cfi_offset(r24,-64)
++	cmpld	cr5, rWORD8_SHIFT, rSTR2
++	add	rN, rN, r12
++	sldi	rWORD6, r12, 3
++	std	rWORD6_SHIFT, -64(r1)
++	cfi_offset(rWORD6_SHIFT, -64)
+ 	subfic	rSHR, rSHL, 64
+-	srdi	rTMP, rN, 5	/* Divide by 32 */
+-	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
++	srdi	r0, rN, 5	/* Divide by 32 */
++	andi.	r12, rN, 24	/* Get the DW remainder */
+ /* We normally need to load 2 DWs to start the unaligned rSTR2, but in
+    this special case those bits may be discarded anyway.  Also we
+    must avoid loading a DW where none of the bits are part of rSTR2 as
+    this may cross a page boundary and cause a page fault.  */
+ 	li	rWORD8, 0
+ 	blt	cr5, L(dus0)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD8, 0(rSTR2)
+-	la	rSTR2, 8(rSTR2)
++	addi	rSTR2, rSTR2, 8
++#endif
+ 	sld	rWORD8, rWORD8, rSHL
+ 
+ L(dus0):
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD1, 0(rSTR1)
+ 	ld	rWORD2, 0(rSTR2)
+-	cmpldi	cr1, rBITDIF, 16
++#endif
++	cmpldi	cr1, r12, 16
+ 	cmpldi	cr7, rN, 32
+-	srd	rG, rWORD2, rSHR
++	srd	r12, rWORD2, rSHR
+ 	clrldi	rN, rN, 61
+ 	beq	L(duPs4)
+-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+-	or	rWORD8, rG, rWORD8
++	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
++	or	rWORD8, r12, rWORD8
+ 	bgt	cr1, L(duPs3)
+ 	beq	cr1, L(duPs2)
+ 
+ /* Remainder is 8 */
+-	.align 4
++	.align	4
+ L(dusP1):
+-	sld	rB, rWORD2, rSHL
+-	sld	rWORD7, rWORD1, r11
+-	sld	rWORD8, rWORD8, r11
++	sld	rWORD8_SHIFT, rWORD2, rSHL
++	sld	rWORD7, rWORD1, rWORD6
++	sld	rWORD8, rWORD8, rWORD6
+ 	bge	cr7, L(duP1e)
+ /* At this point we exit early with the first double word compare
+    complete and remainder of 0 to 7 bytes.  See L(du14) for details on
+@@ -590,95 +794,133 @@
+ 	bne	cr5, L(duLcr5)
+ 	cmpld	cr7, rN, rSHR
+ 	beq	L(duZeroReturn)
+-	li	rA, 0
++	li	r0, 0
+ 	ble	cr7, L(dutrim)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD2, 8(rSTR2)
+-	srd	rA, rWORD2, rSHR
++#endif
++	srd	r0, rWORD2, rSHR
+ 	b	L(dutrim)
+ /* Remainder is 16 */
+-	.align 4
++	.align	4
+ L(duPs2):
+-	sld	rH, rWORD2, rSHL
+-	sld	rWORD5, rWORD1, r11
+-	sld	rWORD6, rWORD8, r11
++	sld	rWORD6_SHIFT, rWORD2, rSHL
++	sld	rWORD5, rWORD1, rWORD6
++	sld	rWORD6, rWORD8, rWORD6
+ 	b	L(duP2e)
+ /* Remainder is 24 */
+-	.align 4
++	.align	4
+ L(duPs3):
+-	sld	rF, rWORD2, rSHL
+-	sld	rWORD3, rWORD1, r11
+-	sld	rWORD4, rWORD8, r11
++	sld	rWORD4_SHIFT, rWORD2, rSHL
++	sld	rWORD3, rWORD1, rWORD6
++	sld	rWORD4, rWORD8, rWORD6
+ 	b	L(duP3e)
+ /* Count is a multiple of 32, remainder is 0 */
+-	.align 4
++	.align	4
+ L(duPs4):
+-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+-	or	rWORD8, rG, rWORD8
+-	sld	rD, rWORD2, rSHL
+-	sld	rWORD1, rWORD1, r11
+-	sld	rWORD2, rWORD8, r11
++	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
++	or	rWORD8, r12, rWORD8
++	sld	rWORD2_SHIFT, rWORD2, rSHL
++	sld	rWORD1, rWORD1, rWORD6
++	sld	rWORD2, rWORD8, rWORD6
+ 	b	L(duP4e)
+ 
+ /* At this point we know rSTR1 is double word aligned and the
+    compare length is at least 8 bytes.  */
+-	.align 4
++	.align	4
+ L(DWunaligned):
+-	std	r27,-40(r1)	
+-	cfi_offset(r27,-40)
++	std	rWORD8_SHIFT, -40(r1)
++	cfi_offset(rWORD8_SHIFT, -40)
+ 	clrrdi	rSTR2, rSTR2, 3
+-	std	r26,-48(r1)	
+-	cfi_offset(r26,-48)
+-	srdi	rTMP, rN, 5	/* Divide by 32 */
+-	std	r25,-56(r1)	
+-	cfi_offset(r25,-56)
+-	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
+-	std	r24,-64(r1)	
+-	cfi_offset(r24,-64)
++	std	rWORD2_SHIFT, -48(r1)
++	cfi_offset(rWORD2_SHIFT, -48)
++	srdi	r0, rN, 5	/* Divide by 32 */
++	std	rWORD4_SHIFT, -56(r1)
++	cfi_offset(rWORD4_SHIFT, -56)
++	andi.	r12, rN, 24	/* Get the DW remainder */
++	std	rWORD6_SHIFT, -64(r1)
++	cfi_offset(rWORD6_SHIFT, -64)
+ 	sldi	rSHL, rSHL, 3
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD6, 0, rSTR2
++	addi	rSTR2, rSTR2, 8
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD6, 0(rSTR2)
+ 	ldu	rWORD8, 8(rSTR2)
+-	cmpldi	cr1, rBITDIF, 16
++#endif
++	cmpldi	cr1, r12, 16
+ 	cmpldi	cr7, rN, 32
+ 	clrldi	rN, rN, 61
+ 	subfic	rSHR, rSHL, 64
+-	sld	rH, rWORD6, rSHL
++	sld	rWORD6_SHIFT, rWORD6, rSHL
+ 	beq	L(duP4)
+-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
++	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+ 	bgt	cr1, L(duP3)
+ 	beq	cr1, L(duP2)
+-		
++
+ /* Remainder is 8 */
+-	.align 4
++	.align	4
+ L(duP1):
+-	srd	rG, rWORD8, rSHR
++	srd	r12, rWORD8, rSHR
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD7, 0, rSTR1
++	addi	rSTR1, rSTR1, 8
++#else
+ 	ld	rWORD7, 0(rSTR1)
+-	sld	rB, rWORD8, rSHL
+-	or	rWORD8, rG, rH
++#endif
++	sld	rWORD8_SHIFT, rWORD8, rSHL
++	or	rWORD8, r12, rWORD6_SHIFT
+ 	blt	cr7, L(duP1x)
+ L(duP1e):
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD1, 8(rSTR1)
+ 	ld	rWORD2, 8(rSTR2)
++#endif
+ 	cmpld	cr5, rWORD7, rWORD8
+-	srd	rA, rWORD2, rSHR
+-	sld	rD, rWORD2, rSHL
+-	or	rWORD2, rA, rB
++	srd	r0, rWORD2, rSHR
++	sld	rWORD2_SHIFT, rWORD2, rSHL
++	or	rWORD2, r0, rWORD8_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD3, 0, rSTR1
++	ldbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD3, 16(rSTR1)
+ 	ld	rWORD4, 16(rSTR2)
+-	cmpld	cr0, rWORD1, rWORD2
+-	srd	rC, rWORD4, rSHR
+-	sld	rF, rWORD4, rSHL
++#endif
++	cmpld	cr7, rWORD1, rWORD2
++	srd	r12, rWORD4, rSHR
++	sld	rWORD4_SHIFT, rWORD4, rSHL
+ 	bne	cr5, L(duLcr5)
+-	or	rWORD4, rC, rD
++	or	rWORD4, r12, rWORD2_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD5, 0, rSTR1
++	ldbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD5, 24(rSTR1)
+ 	ld	rWORD6, 24(rSTR2)
++#endif
+ 	cmpld	cr1, rWORD3, rWORD4
+-	srd	rE, rWORD6, rSHR
+-	sld	rH, rWORD6, rSHL
+-	bne	cr0, L(duLcr0)
+-	or	rWORD6, rE, rF
++	srd	r0, rWORD6, rSHR
++	sld	rWORD6_SHIFT, rWORD6, rSHL
++	bne	cr7, L(duLcr7)
++	or	rWORD6, r0, rWORD4_SHIFT
+ 	cmpld	cr6, rWORD5, rWORD6
+-	b	L(duLoop3)	
+-	.align 4
++	b	L(duLoop3)
++	.align	4
+ /* At this point we exit early with the first double word compare
+    complete and remainder of 0 to 7 bytes.  See L(du14) for details on
+    how we handle the remaining bytes.  */
+@@ -688,186 +930,321 @@
+ 	bne	cr5, L(duLcr5)
+ 	cmpld	cr7, rN, rSHR
+ 	beq	L(duZeroReturn)
+-	li	rA, 0
++	li	r0, 0
+ 	ble	cr7, L(dutrim)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD2, 8(rSTR2)
+-	srd	rA, rWORD2, rSHR
++#endif
++	srd	r0, rWORD2, rSHR
+ 	b	L(dutrim)
+ /* Remainder is 16 */
+-	.align 4
++	.align	4
+ L(duP2):
+-	srd	rE, rWORD8, rSHR
++	srd	r0, rWORD8, rSHR
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD5, 0, rSTR1
++	addi	rSTR1, rSTR1, 8
++#else
+ 	ld	rWORD5, 0(rSTR1)
+-	or	rWORD6, rE, rH
+-	sld	rH, rWORD8, rSHL
++#endif
++	or	rWORD6, r0, rWORD6_SHIFT
++	sld	rWORD6_SHIFT, rWORD8, rSHL
+ L(duP2e):
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD7, 0, rSTR1
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD7, 8(rSTR1)
+ 	ld	rWORD8, 8(rSTR2)
++#endif
+ 	cmpld	cr6, rWORD5, rWORD6
+-	srd	rG, rWORD8, rSHR
+-	sld	rB, rWORD8, rSHL
+-	or	rWORD8, rG, rH
++	srd	r12, rWORD8, rSHR
++	sld	rWORD8_SHIFT, rWORD8, rSHL
++	or	rWORD8, r12, rWORD6_SHIFT
+ 	blt	cr7, L(duP2x)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD1, 16(rSTR1)
+ 	ld	rWORD2, 16(rSTR2)
++#endif
+ 	cmpld	cr5, rWORD7, rWORD8
+ 	bne	cr6, L(duLcr6)
+-	srd	rA, rWORD2, rSHR
+-	sld	rD, rWORD2, rSHL
+-	or	rWORD2, rA, rB
++	srd	r0, rWORD2, rSHR
++	sld	rWORD2_SHIFT, rWORD2, rSHL
++	or	rWORD2, r0, rWORD8_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD3, 0, rSTR1
++	ldbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD3, 24(rSTR1)
+ 	ld	rWORD4, 24(rSTR2)
+-	cmpld	cr0, rWORD1, rWORD2
++#endif
++	cmpld	cr7, rWORD1, rWORD2
+ 	bne	cr5, L(duLcr5)
+-	srd	rC, rWORD4, rSHR
+-	sld	rF, rWORD4, rSHL
+-	or	rWORD4, rC, rD
++	srd	r12, rWORD4, rSHR
++	sld	rWORD4_SHIFT, rWORD4, rSHL
++	or	rWORD4, r12, rWORD2_SHIFT
++#ifndef __LITTLE_ENDIAN__
+ 	addi	rSTR1, rSTR1, 8
+ 	addi	rSTR2, rSTR2, 8
++#endif
+ 	cmpld	cr1, rWORD3, rWORD4
+ 	b	L(duLoop2)
+-	.align 4
++	.align	4
+ L(duP2x):
+ 	cmpld	cr5, rWORD7, rWORD8
++#ifndef __LITTLE_ENDIAN__
+ 	addi	rSTR1, rSTR1, 8
+ 	addi	rSTR2, rSTR2, 8
++#endif
+ 	bne	cr6, L(duLcr6)
+ 	sldi.	rN, rN, 3
+ 	bne	cr5, L(duLcr5)
+ 	cmpld	cr7, rN, rSHR
+ 	beq	L(duZeroReturn)
+-	li	rA, 0
++	li	r0, 0
+ 	ble	cr7, L(dutrim)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD2, 8(rSTR2)
+-	srd	rA, rWORD2, rSHR
++#endif
++	srd	r0, rWORD2, rSHR
+ 	b	L(dutrim)
+-		
++
+ /* Remainder is 24 */
+-	.align 4
++	.align	4
+ L(duP3):
+-	srd	rC, rWORD8, rSHR
++	srd	r12, rWORD8, rSHR
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD3, 0, rSTR1
++	addi	rSTR1, rSTR1, 8
++#else
+ 	ld	rWORD3, 0(rSTR1)
+-	sld	rF, rWORD8, rSHL
+-	or	rWORD4, rC, rH
++#endif
++	sld	rWORD4_SHIFT, rWORD8, rSHL
++	or	rWORD4, r12, rWORD6_SHIFT
+ L(duP3e):
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD5, 0, rSTR1
++	ldbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD5, 8(rSTR1)
+ 	ld	rWORD6, 8(rSTR2)
++#endif
+ 	cmpld	cr1, rWORD3, rWORD4
+-	srd	rE, rWORD6, rSHR
+-	sld	rH, rWORD6, rSHL
+-	or	rWORD6, rE, rF
++	srd	r0, rWORD6, rSHR
++	sld	rWORD6_SHIFT, rWORD6, rSHL
++	or	rWORD6, r0, rWORD4_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD7, 0, rSTR1
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD7, 16(rSTR1)
+ 	ld	rWORD8, 16(rSTR2)
++#endif
+ 	cmpld	cr6, rWORD5, rWORD6
+ 	bne	cr1, L(duLcr1)
+-	srd	rG, rWORD8, rSHR
+-	sld	rB, rWORD8, rSHL
+-	or	rWORD8, rG, rH
++	srd	r12, rWORD8, rSHR
++	sld	rWORD8_SHIFT, rWORD8, rSHL
++	or	rWORD8, r12, rWORD6_SHIFT
+ 	blt	cr7, L(duP3x)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD1, 24(rSTR1)
+ 	ld	rWORD2, 24(rSTR2)
++#endif
+ 	cmpld	cr5, rWORD7, rWORD8
+ 	bne	cr6, L(duLcr6)
+-	srd	rA, rWORD2, rSHR
+-	sld	rD, rWORD2, rSHL
+-	or	rWORD2, rA, rB
++	srd	r0, rWORD2, rSHR
++	sld	rWORD2_SHIFT, rWORD2, rSHL
++	or	rWORD2, r0, rWORD8_SHIFT
++#ifndef __LITTLE_ENDIAN__
+ 	addi	rSTR1, rSTR1, 16
+ 	addi	rSTR2, rSTR2, 16
+-	cmpld	cr0, rWORD1, rWORD2
++#endif
++	cmpld	cr7, rWORD1, rWORD2
+ 	b	L(duLoop1)
+-	.align 4
++	.align	4
+ L(duP3x):
++#ifndef __LITTLE_ENDIAN__
+ 	addi	rSTR1, rSTR1, 16
+ 	addi	rSTR2, rSTR2, 16
++#endif
++#if 0
++/* Disabled as redundant: cr1 was already tested before the branch to L(duP3x).  */
+ 	bne	cr1, L(duLcr1)
++#endif
+ 	cmpld	cr5, rWORD7, rWORD8
+ 	bne	cr6, L(duLcr6)
+ 	sldi.	rN, rN, 3
+ 	bne	cr5, L(duLcr5)
+ 	cmpld	cr7, rN, rSHR
+ 	beq	L(duZeroReturn)
+-	li	rA, 0
++	li	r0, 0
+ 	ble	cr7, L(dutrim)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD2, 8(rSTR2)
+-	srd	rA, rWORD2, rSHR
++#endif
++	srd	r0, rWORD2, rSHR
+ 	b	L(dutrim)
+-	
++
+ /* Count is a multiple of 32, remainder is 0 */
+-	.align 4
++	.align	4
+ L(duP4):
+-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+-	srd	rA, rWORD8, rSHR
++	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
++	srd	r0, rWORD8, rSHR
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	addi	rSTR1, rSTR1, 8
++#else
+ 	ld	rWORD1, 0(rSTR1)
+-	sld	rD, rWORD8, rSHL
+-	or	rWORD2, rA, rH
++#endif
++	sld	rWORD2_SHIFT, rWORD8, rSHL
++	or	rWORD2, r0, rWORD6_SHIFT
+ L(duP4e):
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD3, 0, rSTR1
++	ldbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD3, 8(rSTR1)
+ 	ld	rWORD4, 8(rSTR2)
+-	cmpld	cr0, rWORD1, rWORD2
+-	srd	rC, rWORD4, rSHR
+-	sld	rF, rWORD4, rSHL
+-	or	rWORD4, rC, rD
++#endif
++	cmpld	cr7, rWORD1, rWORD2
++	srd	r12, rWORD4, rSHR
++	sld	rWORD4_SHIFT, rWORD4, rSHL
++	or	rWORD4, r12, rWORD2_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD5, 0, rSTR1
++	ldbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD5, 16(rSTR1)
+ 	ld	rWORD6, 16(rSTR2)
++#endif
+ 	cmpld	cr1, rWORD3, rWORD4
+-	bne	cr0, L(duLcr0)
+-	srd	rE, rWORD6, rSHR
+-	sld	rH, rWORD6, rSHL
+-	or	rWORD6, rE, rF
++	bne	cr7, L(duLcr7)
++	srd	r0, rWORD6, rSHR
++	sld	rWORD6_SHIFT, rWORD6, rSHL
++	or	rWORD6, r0, rWORD4_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD7, 0, rSTR1
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ldu	rWORD7, 24(rSTR1)
+ 	ldu	rWORD8, 24(rSTR2)
++#endif
+ 	cmpld	cr6, rWORD5, rWORD6
+ 	bne	cr1, L(duLcr1)
+-	srd	rG, rWORD8, rSHR
+-	sld	rB, rWORD8, rSHL
+-	or	rWORD8, rG, rH
++	srd	r12, rWORD8, rSHR
++	sld	rWORD8_SHIFT, rWORD8, rSHL
++	or	rWORD8, r12, rWORD6_SHIFT
+ 	cmpld	cr5, rWORD7, rWORD8
+ 	bdz-	L(du24)		/* Adjust CTR as we start with +4 */
+ /* This is the primary loop */
+-	.align 4
++	.align	4
+ L(duLoop):
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD1, 8(rSTR1)
+ 	ld	rWORD2, 8(rSTR2)
++#endif
+ 	cmpld	cr1, rWORD3, rWORD4
+ 	bne	cr6, L(duLcr6)
+-	srd	rA, rWORD2, rSHR
+-	sld	rD, rWORD2, rSHL
+-	or	rWORD2, rA, rB
++	srd	r0, rWORD2, rSHR
++	sld	rWORD2_SHIFT, rWORD2, rSHL
++	or	rWORD2, r0, rWORD8_SHIFT
+ L(duLoop1):
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD3, 0, rSTR1
++	ldbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD3, 16(rSTR1)
+ 	ld	rWORD4, 16(rSTR2)
++#endif
+ 	cmpld	cr6, rWORD5, rWORD6
+ 	bne	cr5, L(duLcr5)
+-	srd	rC, rWORD4, rSHR
+-	sld	rF, rWORD4, rSHL
+-	or	rWORD4, rC, rD
++	srd	r12, rWORD4, rSHR
++	sld	rWORD4_SHIFT, rWORD4, rSHL
++	or	rWORD4, r12, rWORD2_SHIFT
+ L(duLoop2):
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD5, 0, rSTR1
++	ldbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD5, 24(rSTR1)
+ 	ld	rWORD6, 24(rSTR2)
++#endif
+ 	cmpld	cr5, rWORD7, rWORD8
+-	bne	cr0, L(duLcr0)
+-	srd	rE, rWORD6, rSHR
+-	sld	rH, rWORD6, rSHL
+-	or	rWORD6, rE, rF
++	bne	cr7, L(duLcr7)
++	srd	r0, rWORD6, rSHR
++	sld	rWORD6_SHIFT, rWORD6, rSHL
++	or	rWORD6, r0, rWORD4_SHIFT
+ L(duLoop3):
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD7, 0, rSTR1
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ldu	rWORD7, 32(rSTR1)
+ 	ldu	rWORD8, 32(rSTR2)
+-	cmpld	cr0, rWORD1, rWORD2
++#endif
++	cmpld	cr7, rWORD1, rWORD2
+ 	bne-	cr1, L(duLcr1)
+-	srd	rG, rWORD8, rSHR
+-	sld	rB, rWORD8, rSHL
+-	or	rWORD8, rG, rH
+-	bdnz+	L(duLoop)	
+-	
++	srd	r12, rWORD8, rSHR
++	sld	rWORD8_SHIFT, rWORD8, rSHL
++	or	rWORD8, r12, rWORD6_SHIFT
++	bdnz+	L(duLoop)
++
+ L(duL4):
++#if 0
++/* Huh?  We've already branched on cr1!  */
+ 	bne	cr1, L(duLcr1)
++#endif
+ 	cmpld	cr1, rWORD3, rWORD4
+ 	bne	cr6, L(duLcr6)
+ 	cmpld	cr6, rWORD5, rWORD6
+ 	bne	cr5, L(duLcr5)
+ 	cmpld	cr5, rWORD7, rWORD8
+ L(du44):
+-	bne	cr0, L(duLcr0)
++	bne	cr7, L(duLcr7)
+ L(du34):
+ 	bne	cr1, L(duLcr1)
+ L(du24):
+@@ -876,106 +1253,113 @@
+ 	sldi.	rN, rN, 3
+ 	bne	cr5, L(duLcr5)
+ /* At this point we have a remainder of 1 to 7 bytes to compare.  We use
+-   shift right double to elliminate bits beyond the compare length. 
+-   This allows the use of double word subtract to compute the final
+-   result.
++   shift right double to eliminate bits beyond the compare length.
+ 
+-   However it may not be safe to load rWORD2 which may be beyond the 
++   However it may not be safe to load rWORD2 which may be beyond the
+    string length. So we compare the bit length of the remainder to
+    the right shift count (rSHR). If the bit count is less than or equal
+    we do not need to load rWORD2 (all significant bits are already in
+-   rB).  */
++   rWORD8_SHIFT).  */
+ 	cmpld	cr7, rN, rSHR
+ 	beq	L(duZeroReturn)
+-	li	rA, 0
++	li	r0, 0
+ 	ble	cr7, L(dutrim)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR2, rSTR2, 8
++#else
+ 	ld	rWORD2, 8(rSTR2)
+-	srd	rA, rWORD2, rSHR
+-	.align 4
++#endif
++	srd	r0, rWORD2, rSHR
++	.align	4
+ L(dutrim):
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++#else
+ 	ld	rWORD1, 8(rSTR1)
+-	ld	rWORD8,-8(r1)
+-	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */ 
+-	or	rWORD2, rA, rB
+-	ld	rWORD7,-16(r1)	
+-	ld	r29,-24(r1)
++#endif
++	ld	rWORD8, -8(r1)
++	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
++	or	rWORD2, r0, rWORD8_SHIFT
++	ld	rWORD7, -16(r1)
++	ld	rSHL, -24(r1)
+ 	srd	rWORD1, rWORD1, rN
+ 	srd	rWORD2, rWORD2, rN
+-	ld	r28,-32(r1)	
+-	ld	r27,-40(r1)
++	ld	rSHR, -32(r1)
++	ld	rWORD8_SHIFT, -40(r1)
+ 	li	rRTN, 0
+-	cmpld	cr0, rWORD1, rWORD2	
+-	ld	r26,-48(r1)
+-	ld	r25,-56(r1)
+- 	beq	cr0, L(dureturn24)
+-	li	rRTN, 1
+-	ld	r24,-64(r1)
+-	bgtlr	cr0
+-	li	rRTN, -1
+-	blr
+-	.align 4
+-L(duLcr0):
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
+-	li	rRTN, 1
+-	bgt	cr0, L(dureturn29)	
+-	ld	r29,-24(r1)
+-	ld	r28,-32(r1)
++	cmpld	cr7, rWORD1, rWORD2
++	ld	rWORD2_SHIFT, -48(r1)
++	ld	rWORD4_SHIFT, -56(r1)
++	beq	cr7, L(dureturn24)
++	li	rRTN, 1
++	ld	rWORD6_SHIFT, -64(r1)
++	bgtlr	cr7
++	li	rRTN, -1
++	blr
++	.align	4
++L(duLcr7):
++	ld	rWORD8, -8(r1)
++	ld	rWORD7, -16(r1)
++	li	rRTN, 1
++	bgt	cr7, L(dureturn29)
++	ld	rSHL, -24(r1)
++	ld	rSHR, -32(r1)
+ 	li	rRTN, -1
+ 	b	L(dureturn27)
+-	.align 4
++	.align	4
+ L(duLcr1):
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
++	ld	rWORD8, -8(r1)
++	ld	rWORD7, -16(r1)
+ 	li	rRTN, 1
+-	bgt	cr1, L(dureturn29)	
+-	ld	r29,-24(r1)
+-	ld	r28,-32(r1)
++	bgt	cr1, L(dureturn29)
++	ld	rSHL, -24(r1)
++	ld	rSHR, -32(r1)
+ 	li	rRTN, -1
+ 	b	L(dureturn27)
+-	.align 4
++	.align	4
+ L(duLcr6):
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
++	ld	rWORD8, -8(r1)
++	ld	rWORD7, -16(r1)
+ 	li	rRTN, 1
+-	bgt	cr6, L(dureturn29)	
+-	ld	r29,-24(r1)
+-	ld	r28,-32(r1)
++	bgt	cr6, L(dureturn29)
++	ld	rSHL, -24(r1)
++	ld	rSHR, -32(r1)
+ 	li	rRTN, -1
+ 	b	L(dureturn27)
+-	.align 4
++	.align	4
+ L(duLcr5):
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
++	ld	rWORD8, -8(r1)
++	ld	rWORD7, -16(r1)
+ 	li	rRTN, 1
+-	bgt	cr5, L(dureturn29)	
+-	ld	r29,-24(r1)
+-	ld	r28,-32(r1)
++	bgt	cr5, L(dureturn29)
++	ld	rSHL, -24(r1)
++	ld	rSHR, -32(r1)
+ 	li	rRTN, -1
+ 	b	L(dureturn27)
+ 	.align	3
+ L(duZeroReturn):
+-	li	rRTN,0
++	li	rRTN, 0
+ 	.align	4
+ L(dureturn):
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
+-L(dureturn29):	
+-	ld	r29,-24(r1)
+-	ld	r28,-32(r1)
+-L(dureturn27):	
+-	ld	r27,-40(r1)
+-L(dureturn26):	
+-	ld	r26,-48(r1)
+-L(dureturn25):	
+-	ld	r25,-56(r1)
++	ld	rWORD8, -8(r1)
++	ld	rWORD7, -16(r1)
++L(dureturn29):
++	ld	rSHL, -24(r1)
++	ld	rSHR, -32(r1)
++L(dureturn27):
++	ld	rWORD8_SHIFT, -40(r1)
++L(dureturn26):
++	ld	rWORD2_SHIFT, -48(r1)
++L(dureturn25):
++	ld	rWORD4_SHIFT, -56(r1)
+ L(dureturn24):
+-	ld	r24,-64(r1)
++	ld	rWORD6_SHIFT, -64(r1)
+ 	blr
+ L(duzeroLength):
+-	li	rRTN,0
++	li	rRTN, 0
+ 	blr
+ 
+-END (BP_SYM (memcmp))
++END (memcmp)
+ libc_hidden_builtin_def (memcmp)
+ weak_alias (memcmp, bcmp)
+diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcmp.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcmp.S
+--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcmp.S	2014-05-28 19:22:37.000000000 -0500
++++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/memcmp.S	2014-05-29 09:35:08.000000000 -0500
+@@ -1,5 +1,5 @@
+ /* Optimized memcmp implementation for POWER7/PowerPC64.
+-   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
++   Copyright (C) 2010-2014 Free Software Foundation, Inc.
+    This file is part of the GNU C Library.
+ 
+    The GNU C Library is free software; you can redistribute it and/or
+@@ -17,379 +17,576 @@
+    <http://www.gnu.org/licenses/>.  */
+ 
+ #include <sysdep.h>
+-#include <bp-sym.h>
+-#include <bp-asm.h>
+ 
+ /* int [r3] memcmp (const char *s1 [r3],
+ 		    const char *s2 [r4],
+ 		    size_t size [r5])  */
+ 
+ 	.machine power7
+-EALIGN (BP_SYM(memcmp),4,0)
++EALIGN (memcmp, 4, 0)
+ 	CALL_MCOUNT 3
+ 
+-#define rTMP	r0
+ #define rRTN	r3
+ #define rSTR1	r3	/* first string arg */
+ #define rSTR2	r4	/* second string arg */
+ #define rN	r5	/* max string length */
+-/* Note:  The Bounded pointer support in this code is broken.  This code
+-   was inherited from PPC32 and that support was never completed.
+-   Current PPC gcc does not support -fbounds-check or -fbounded-pointers.  */
+ #define rWORD1	r6	/* current word in s1 */
+ #define rWORD2	r7	/* current word in s2 */
+ #define rWORD3	r8	/* next word in s1 */
+ #define rWORD4	r9	/* next word in s2 */
+ #define rWORD5	r10	/* next word in s1 */
+ #define rWORD6	r11	/* next word in s2 */
+-#define rBITDIF	r12	/* bits that differ in s1 & s2 words */
+ #define rWORD7	r30	/* next word in s1 */
+ #define rWORD8	r31	/* next word in s2 */
+ 
+-	xor	rTMP,rSTR2,rSTR1
+-	cmpldi	cr6,rN,0
+-	cmpldi	cr1,rN,12
+-	clrldi.	rTMP,rTMP,61
+-	clrldi	rBITDIF,rSTR1,61
+-	cmpldi	cr5,rBITDIF,0
+-	beq-	cr6,L(zeroLength)
+-	dcbt	0,rSTR1
+-	dcbt	0,rSTR2
+-/* If less than 8 bytes or not aligned, use the unalligned
++	xor	r0, rSTR2, rSTR1
++	cmpldi	cr6, rN, 0
++	cmpldi	cr1, rN, 12
++	clrldi.	r0, r0, 61
++	clrldi	r12, rSTR1, 61
++	cmpldi	cr5, r12, 0
++	beq-	cr6, L(zeroLength)
++	dcbt	0, rSTR1
++	dcbt	0, rSTR2
++/* If less than 8 bytes or not aligned, use the unaligned
+    byte loop.  */
+-	blt	cr1,L(bytealigned)
+-	std	rWORD8,-8(r1)
+-	cfi_offset(rWORD8,-8)
+-	std	rWORD7,-16(r1)
+-	cfi_offset(rWORD7,-16)
++	blt	cr1, L(bytealigned)
++	std	rWORD8, -8(r1)
++	cfi_offset(rWORD8, -8)
++	std	rWORD7, -16(r1)
++	cfi_offset(rWORD7, -16)
+ 	bne	L(unaligned)
+ /* At this point we know both strings have the same alignment and the
+-   compare length is at least 8 bytes.  rBITDIF containes the low order
++   compare length is at least 8 bytes.  r12 contains the low order
+    3 bits of rSTR1 and cr5 contains the result of the logical compare
+-   of rBITDIF to 0.  If rBITDIF == 0 then we are already double word
+-   aligned and can perform the DWaligned loop.
++   of r12 to 0.  If r12 == 0 then we are already double word
++   aligned and can perform the DW aligned loop.
+ 
+    Otherwise we know the two strings have the same alignment (but not
+-   yet DW).  So we can force the string addresses to the next lower DW
+-   boundary and special case this first DW word using shift left to
+-   ellimiate bits preceeding the first byte.  Since we want to join the
+-   normal (DWaligned) compare loop, starting at the second double word,
++   yet DW).  So we force the string addresses to the next lower DW
++   boundary and special case this first DW using shift left to
++   eliminate bits preceding the first byte.  Since we want to join the
++   normal (DW aligned) compare loop, starting at the second double word,
+    we need to adjust the length (rN) and special case the loop
+-   versioning for the first DW. This insures that the loop count is
+-   correct and the first DW (shifted) is in the expected resister pair.  */
++   versioning for the first DW. This ensures that the loop count is
++   correct and the first DW (shifted) is in the expected register pair.  */
+ 	.align	4
+ L(samealignment):
+-	clrrdi	rSTR1,rSTR1,3
+-	clrrdi	rSTR2,rSTR2,3
+-	beq	cr5,L(DWaligned)
+-	add	rN,rN,rBITDIF
+-	sldi	r11,rBITDIF,3
+-	srdi	rTMP,rN,5	/* Divide by 32 */
+-	andi.	rBITDIF,rN,24	/* Get the DW remainder */
+-	ld	rWORD1,0(rSTR1)
+-	ld	rWORD2,0(rSTR2)
+-	cmpldi	cr1,rBITDIF,16
+-	cmpldi	cr7,rN,32
+-	clrldi	rN,rN,61
++	clrrdi	rSTR1, rSTR1, 3
++	clrrdi	rSTR2, rSTR2, 3
++	beq	cr5, L(DWaligned)
++	add	rN, rN, r12
++	sldi	rWORD6, r12, 3
++	srdi	r0, rN, 5	/* Divide by 32 */
++	andi.	r12, rN, 24	/* Get the DW remainder */
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD1, 0(rSTR1)
++	ld	rWORD2, 0(rSTR2)
++#endif
++	cmpldi	cr1, r12, 16
++	cmpldi	cr7, rN, 32
++	clrldi	rN, rN, 61
+ 	beq	L(dPs4)
+-	mtctr	rTMP
+-	bgt	cr1,L(dPs3)
+-	beq	cr1,L(dPs2)
++	mtctr	r0
++	bgt	cr1, L(dPs3)
++	beq	cr1, L(dPs2)
+ 
+ /* Remainder is 8 */
+ 	.align	3
+ L(dsP1):
+-	sld	rWORD5,rWORD1,r11
+-	sld	rWORD6,rWORD2,r11
+-	cmpld	cr5,rWORD5,rWORD6
+-	blt	cr7,L(dP1x)
++	sld	rWORD5, rWORD1, rWORD6
++	sld	rWORD6, rWORD2, rWORD6
++	cmpld	cr5, rWORD5, rWORD6
++	blt	cr7, L(dP1x)
+ /* Do something useful in this cycle since we have to branch anyway.  */
+-	ld	rWORD1,8(rSTR1)
+-	ld	rWORD2,8(rSTR2)
+-	cmpld	cr0,rWORD1,rWORD2
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD1, 8(rSTR1)
++	ld	rWORD2, 8(rSTR2)
++#endif
++	cmpld	cr7, rWORD1, rWORD2
+ 	b	L(dP1e)
+ /* Remainder is 16 */
+ 	.align	4
+ L(dPs2):
+-	sld	rWORD5,rWORD1,r11
+-	sld	rWORD6,rWORD2,r11
+-	cmpld	cr6,rWORD5,rWORD6
+-	blt	cr7,L(dP2x)
++	sld	rWORD5, rWORD1, rWORD6
++	sld	rWORD6, rWORD2, rWORD6
++	cmpld	cr6, rWORD5, rWORD6
++	blt	cr7, L(dP2x)
+ /* Do something useful in this cycle since we have to branch anyway.  */
+-	ld	rWORD7,8(rSTR1)
+-	ld	rWORD8,8(rSTR2)
+-	cmpld	cr5,rWORD7,rWORD8
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD7, 0, rSTR1
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD7, 8(rSTR1)
++	ld	rWORD8, 8(rSTR2)
++#endif
++	cmpld	cr5, rWORD7, rWORD8
+ 	b	L(dP2e)
+ /* Remainder is 24 */
+ 	.align	4
+ L(dPs3):
+-	sld	rWORD3,rWORD1,r11
+-	sld	rWORD4,rWORD2,r11
+-	cmpld	cr1,rWORD3,rWORD4
++	sld	rWORD3, rWORD1, rWORD6
++	sld	rWORD4, rWORD2, rWORD6
++	cmpld	cr1, rWORD3, rWORD4
+ 	b	L(dP3e)
+ /* Count is a multiple of 32, remainder is 0 */
+ 	.align	4
+ L(dPs4):
+-	mtctr	rTMP
+-	sld	rWORD1,rWORD1,r11
+-	sld	rWORD2,rWORD2,r11
+-	cmpld	cr0,rWORD1,rWORD2
++	mtctr	r0
++	sld	rWORD1, rWORD1, rWORD6
++	sld	rWORD2, rWORD2, rWORD6
++	cmpld	cr7, rWORD1, rWORD2
+ 	b	L(dP4e)
+ 
+ /* At this point we know both strings are double word aligned and the
+    compare length is at least 8 bytes.  */
+ 	.align	4
+ L(DWaligned):
+-	andi.	rBITDIF,rN,24	/* Get the DW remainder */
+-	srdi	rTMP,rN,5	/* Divide by 32 */
+-	cmpldi	cr1,rBITDIF,16
+-	cmpldi	cr7,rN,32
+-	clrldi	rN,rN,61
++	andi.	r12, rN, 24	/* Get the DW remainder */
++	srdi	r0, rN, 5	/* Divide by 32 */
++	cmpldi	cr1, r12, 16
++	cmpldi	cr7, rN, 32
++	clrldi	rN, rN, 61
+ 	beq	L(dP4)
+-	bgt	cr1,L(dP3)
+-	beq	cr1,L(dP2)
++	bgt	cr1, L(dP3)
++	beq	cr1, L(dP2)
+ 
+ /* Remainder is 8 */
+ 	.align	4
+ L(dP1):
+-	mtctr	rTMP
++	mtctr	r0
+ /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
+-   (8-15 byte compare), we want to use only volitile registers.  This
+-   means we can avoid restoring non-volitile registers since we did not
++   (8-15 byte compare), we want to use only volatile registers.  This
++   means we can avoid restoring non-volatile registers since we did not
+    change any on the early exit path.  The key here is the non-early
+    exit path only cares about the condition code (cr5), not about which
+    register pair was used.  */
+-	ld	rWORD5,0(rSTR1)
+-	ld	rWORD6,0(rSTR2)
+-	cmpld	cr5,rWORD5,rWORD6
+-	blt	cr7,L(dP1x)
+-	ld	rWORD1,8(rSTR1)
+-	ld	rWORD2,8(rSTR2)
+-	cmpld	cr0,rWORD1,rWORD2
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD5, 0, rSTR1
++	ldbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD5, 0(rSTR1)
++	ld	rWORD6, 0(rSTR2)
++#endif
++	cmpld	cr5, rWORD5, rWORD6
++	blt	cr7, L(dP1x)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD1, 8(rSTR1)
++	ld	rWORD2, 8(rSTR2)
++#endif
++	cmpld	cr7, rWORD1, rWORD2
+ L(dP1e):
+-	ld	rWORD3,16(rSTR1)
+-	ld	rWORD4,16(rSTR2)
+-	cmpld	cr1,rWORD3,rWORD4
+-	ld	rWORD5,24(rSTR1)
+-	ld	rWORD6,24(rSTR2)
+-	cmpld	cr6,rWORD5,rWORD6
+-	bne	cr5,L(dLcr5)
+-	bne	cr0,L(dLcr0)
+-
+-	ldu	rWORD7,32(rSTR1)
+-	ldu	rWORD8,32(rSTR2)
+-	bne	cr1,L(dLcr1)
+-	cmpld	cr5,rWORD7,rWORD8
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD3, 0, rSTR1
++	ldbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD3, 16(rSTR1)
++	ld	rWORD4, 16(rSTR2)
++#endif
++	cmpld	cr1, rWORD3, rWORD4
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD5, 0, rSTR1
++	ldbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD5, 24(rSTR1)
++	ld	rWORD6, 24(rSTR2)
++#endif
++	cmpld	cr6, rWORD5, rWORD6
++	bne	cr5, L(dLcr5x)
++	bne	cr7, L(dLcr7x)
++
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD7, 0, rSTR1
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ldu	rWORD7, 32(rSTR1)
++	ldu	rWORD8, 32(rSTR2)
++#endif
++	bne	cr1, L(dLcr1)
++	cmpld	cr5, rWORD7, rWORD8
+ 	bdnz	L(dLoop)
+-	bne	cr6,L(dLcr6)
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
++	bne	cr6, L(dLcr6)
++	ld	rWORD8, -8(r1)
++	ld	rWORD7, -16(r1)
+ 	.align	3
+ L(dP1x):
+-	sldi.	r12,rN,3
+-	bne	cr5,L(dLcr5)
+-	subfic	rN,r12,64	/* Shift count is 64 - (rN * 8).  */
++	sldi.	r12, rN, 3
++	bne	cr5, L(dLcr5x)
++	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
+ 	bne	L(d00)
+-	li	rRTN,0
++	li	rRTN, 0
+ 	blr
+ 
+ /* Remainder is 16 */
+ 	.align	4
+ L(dP2):
+-	mtctr	rTMP
+-	ld	rWORD5,0(rSTR1)
+-	ld	rWORD6,0(rSTR2)
+-	cmpld	cr6,rWORD5,rWORD6
+-	blt	cr7,L(dP2x)
+-	ld	rWORD7,8(rSTR1)
+-	ld	rWORD8,8(rSTR2)
+-	cmpld	cr5,rWORD7,rWORD8
++	mtctr	r0
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD5, 0, rSTR1
++	ldbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD5, 0(rSTR1)
++	ld	rWORD6, 0(rSTR2)
++#endif
++	cmpld	cr6, rWORD5, rWORD6
++	blt	cr7, L(dP2x)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD7, 0, rSTR1
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD7, 8(rSTR1)
++	ld	rWORD8, 8(rSTR2)
++#endif
++	cmpld	cr5, rWORD7, rWORD8
+ L(dP2e):
+-	ld	rWORD1,16(rSTR1)
+-	ld	rWORD2,16(rSTR2)
+-	cmpld	cr0,rWORD1,rWORD2
+-	ld	rWORD3,24(rSTR1)
+-	ld	rWORD4,24(rSTR2)
+-	cmpld	cr1,rWORD3,rWORD4
+-	addi	rSTR1,rSTR1,8
+-	addi	rSTR2,rSTR2,8
+-	bne	cr6,L(dLcr6)
+-	bne	cr5,L(dLcr5)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD1, 16(rSTR1)
++	ld	rWORD2, 16(rSTR2)
++#endif
++	cmpld	cr7, rWORD1, rWORD2
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD3, 0, rSTR1
++	ldbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD3, 24(rSTR1)
++	ld	rWORD4, 24(rSTR2)
++#endif
++	cmpld	cr1, rWORD3, rWORD4
++#ifndef __LITTLE_ENDIAN__
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#endif
++	bne	cr6, L(dLcr6)
++	bne	cr5, L(dLcr5)
+ 	b	L(dLoop2)
+ /* Again we are on a early exit path (16-23 byte compare), we want to
+-   only use volitile registers and avoid restoring non-volitile
++   only use volatile registers and avoid restoring non-volatile
+    registers.  */
+ 	.align	4
+ L(dP2x):
+-	ld	rWORD3,8(rSTR1)
+-	ld	rWORD4,8(rSTR2)
+-	cmpld	cr5,rWORD3,rWORD4
+-	sldi.	r12,rN,3
+-	bne	cr6,L(dLcr6)
+-	addi	rSTR1,rSTR1,8
+-	addi	rSTR2,rSTR2,8
+-	bne	cr5,L(dLcr5)
+-	subfic	rN,r12,64	/* Shift count is 64 - (rN * 8).  */
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD3, 0, rSTR1
++	ldbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD3, 8(rSTR1)
++	ld	rWORD4, 8(rSTR2)
++#endif
++	cmpld	cr1, rWORD3, rWORD4
++	sldi.	r12, rN, 3
++	bne	cr6, L(dLcr6x)
++#ifndef __LITTLE_ENDIAN__
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#endif
++	bne	cr1, L(dLcr1x)
++	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
+ 	bne	L(d00)
+-	li	rRTN,0
++	li	rRTN, 0
+ 	blr
+ 
+ /* Remainder is 24 */
+ 	.align	4
+ L(dP3):
+-	mtctr	rTMP
+-	ld	rWORD3,0(rSTR1)
+-	ld	rWORD4,0(rSTR2)
+-	cmpld	cr1,rWORD3,rWORD4
++	mtctr	r0
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD3, 0, rSTR1
++	ldbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD3, 0(rSTR1)
++	ld	rWORD4, 0(rSTR2)
++#endif
++	cmpld	cr1, rWORD3, rWORD4
+ L(dP3e):
+-	ld	rWORD5,8(rSTR1)
+-	ld	rWORD6,8(rSTR2)
+-	cmpld	cr6,rWORD5,rWORD6
+-	blt	cr7,L(dP3x)
+-	ld	rWORD7,16(rSTR1)
+-	ld	rWORD8,16(rSTR2)
+-	cmpld	cr5,rWORD7,rWORD8
+-	ld	rWORD1,24(rSTR1)
+-	ld	rWORD2,24(rSTR2)
+-	cmpld	cr0,rWORD1,rWORD2
+-	addi	rSTR1,rSTR1,16
+-	addi	rSTR2,rSTR2,16
+-	bne	cr1,L(dLcr1)
+-	bne	cr6,L(dLcr6)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD5, 0, rSTR1
++	ldbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD5, 8(rSTR1)
++	ld	rWORD6, 8(rSTR2)
++#endif
++	cmpld	cr6, rWORD5, rWORD6
++	blt	cr7, L(dP3x)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD7, 0, rSTR1
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD7, 16(rSTR1)
++	ld	rWORD8, 16(rSTR2)
++#endif
++	cmpld	cr5, rWORD7, rWORD8
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD1, 24(rSTR1)
++	ld	rWORD2, 24(rSTR2)
++#endif
++	cmpld	cr7, rWORD1, rWORD2
++#ifndef __LITTLE_ENDIAN__
++	addi	rSTR1, rSTR1, 16
++	addi	rSTR2, rSTR2, 16
++#endif
++	bne	cr1, L(dLcr1)
++	bne	cr6, L(dLcr6)
+ 	b	L(dLoop1)
+ /* Again we are on a early exit path (24-31 byte compare), we want to
+-   only use volitile registers and avoid restoring non-volitile
++   only use volatile registers and avoid restoring non-volatile
+    registers.  */
+ 	.align	4
+ L(dP3x):
+-	ld	rWORD1,16(rSTR1)
+-	ld	rWORD2,16(rSTR2)
+-	cmpld	cr5,rWORD1,rWORD2
+-	sldi.	r12,rN,3
+-	bne	cr1,L(dLcr1)
+-	addi	rSTR1,rSTR1,16
+-	addi	rSTR2,rSTR2,16
+-	bne	cr6,L(dLcr6)
+-	subfic	rN,r12,64	/* Shift count is 64 - (rN * 8).  */
+-	bne	cr5,L(dLcr5)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD1, 16(rSTR1)
++	ld	rWORD2, 16(rSTR2)
++#endif
++	cmpld	cr7, rWORD1, rWORD2
++	sldi.	r12, rN, 3
++	bne	cr1, L(dLcr1x)
++#ifndef __LITTLE_ENDIAN__
++	addi	rSTR1, rSTR1, 16
++	addi	rSTR2, rSTR2, 16
++#endif
++	bne	cr6, L(dLcr6x)
++	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
++	bne	cr7, L(dLcr7x)
+ 	bne	L(d00)
+-	li	rRTN,0
++	li	rRTN, 0
+ 	blr
+ 
+ /* Count is a multiple of 32, remainder is 0 */
+ 	.align	4
+ L(dP4):
+-	mtctr	rTMP
+-	ld	rWORD1,0(rSTR1)
+-	ld	rWORD2,0(rSTR2)
+-	cmpld	cr0,rWORD1,rWORD2
++	mtctr	r0
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD1, 0(rSTR1)
++	ld	rWORD2, 0(rSTR2)
++#endif
++	cmpld	cr7, rWORD1, rWORD2
+ L(dP4e):
+-	ld	rWORD3,8(rSTR1)
+-	ld	rWORD4,8(rSTR2)
+-	cmpld	cr1,rWORD3,rWORD4
+-	ld	rWORD5,16(rSTR1)
+-	ld	rWORD6,16(rSTR2)
+-	cmpld	cr6,rWORD5,rWORD6
+-	ldu	rWORD7,24(rSTR1)
+-	ldu	rWORD8,24(rSTR2)
+-	cmpld	cr5,rWORD7,rWORD8
+-	bne	cr0,L(dLcr0)
+-	bne	cr1,L(dLcr1)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD3, 0, rSTR1
++	ldbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD3, 8(rSTR1)
++	ld	rWORD4, 8(rSTR2)
++#endif
++	cmpld	cr1, rWORD3, rWORD4
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD5, 0, rSTR1
++	ldbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD5, 16(rSTR1)
++	ld	rWORD6, 16(rSTR2)
++#endif
++	cmpld	cr6, rWORD5, rWORD6
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD7, 0, rSTR1
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ldu	rWORD7, 24(rSTR1)
++	ldu	rWORD8, 24(rSTR2)
++#endif
++	cmpld	cr5, rWORD7, rWORD8
++	bne	cr7, L(dLcr7)
++	bne	cr1, L(dLcr1)
+ 	bdz-	L(d24)		/* Adjust CTR as we start with +4 */
+ /* This is the primary loop */
+ 	.align	4
+ L(dLoop):
+-	ld	rWORD1,8(rSTR1)
+-	ld	rWORD2,8(rSTR2)
+-	cmpld	cr1,rWORD3,rWORD4
+-	bne	cr6,L(dLcr6)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD1, 8(rSTR1)
++	ld	rWORD2, 8(rSTR2)
++#endif
++	cmpld	cr1, rWORD3, rWORD4
++	bne	cr6, L(dLcr6)
+ L(dLoop1):
+-	ld	rWORD3,16(rSTR1)
+-	ld	rWORD4,16(rSTR2)
+-	cmpld	cr6,rWORD5,rWORD6
+-	bne	cr5,L(dLcr5)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD3, 0, rSTR1
++	ldbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD3, 16(rSTR1)
++	ld	rWORD4, 16(rSTR2)
++#endif
++	cmpld	cr6, rWORD5, rWORD6
++	bne	cr5, L(dLcr5)
+ L(dLoop2):
+-	ld	rWORD5,24(rSTR1)
+-	ld	rWORD6,24(rSTR2)
+-	cmpld	cr5,rWORD7,rWORD8
+-	bne	cr0,L(dLcr0)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD5, 0, rSTR1
++	ldbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD5, 24(rSTR1)
++	ld	rWORD6, 24(rSTR2)
++#endif
++	cmpld	cr5, rWORD7, rWORD8
++	bne	cr7, L(dLcr7)
+ L(dLoop3):
+-	ldu	rWORD7,32(rSTR1)
+-	ldu	rWORD8,32(rSTR2)
+-	bne	cr1,L(dLcr1)
+-	cmpld	cr0,rWORD1,rWORD2
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD7, 0, rSTR1
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ldu	rWORD7, 32(rSTR1)
++	ldu	rWORD8, 32(rSTR2)
++#endif
++	bne	cr1, L(dLcr1)
++	cmpld	cr7, rWORD1, rWORD2
+ 	bdnz	L(dLoop)
+ 
+ L(dL4):
+-	cmpld	cr1,rWORD3,rWORD4
+-	bne	cr6,L(dLcr6)
+-	cmpld	cr6,rWORD5,rWORD6
+-	bne	cr5,L(dLcr5)
+-	cmpld	cr5,rWORD7,rWORD8
++	cmpld	cr1, rWORD3, rWORD4
++	bne	cr6, L(dLcr6)
++	cmpld	cr6, rWORD5, rWORD6
++	bne	cr5, L(dLcr5)
++	cmpld	cr5, rWORD7, rWORD8
+ L(d44):
+-	bne	cr0,L(dLcr0)
++	bne	cr7, L(dLcr7)
+ L(d34):
+-	bne	cr1,L(dLcr1)
++	bne	cr1, L(dLcr1)
+ L(d24):
+-	bne	cr6,L(dLcr6)
++	bne	cr6, L(dLcr6)
+ L(d14):
+-	sldi.	r12,rN,3
+-	bne	cr5,L(dLcr5)
++	sldi.	r12, rN, 3
++	bne	cr5, L(dLcr5)
+ L(d04):
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
+-	subfic	rN,r12,64	/* Shift count is 64 - (rN * 8).  */
++	ld	rWORD8, -8(r1)
++	ld	rWORD7, -16(r1)
++	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
+ 	beq	L(zeroLength)
+ /* At this point we have a remainder of 1 to 7 bytes to compare.  Since
+    we are aligned it is safe to load the whole double word, and use
+-   shift right double to elliminate bits beyond the compare length.  */
++   shift right double to eliminate bits beyond the compare length.  */
+ L(d00):
+-	ld	rWORD1,8(rSTR1)
+-	ld	rWORD2,8(rSTR2)
+-	srd	rWORD1,rWORD1,rN
+-	srd	rWORD2,rWORD2,rN
+-	cmpld	cr5,rWORD1,rWORD2
+-	bne	cr5,L(dLcr5x)
+-	li	rRTN,0
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD1, 8(rSTR1)
++	ld	rWORD2, 8(rSTR2)
++#endif
++	srd	rWORD1, rWORD1, rN
++	srd	rWORD2, rWORD2, rN
++	cmpld	cr7, rWORD1, rWORD2
++	bne	cr7, L(dLcr7x)
++	li	rRTN, 0
+ 	blr
++
+ 	.align	4
+-L(dLcr0):
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
+-	li	rRTN,1
+-	bgtlr	cr0
+-	li	rRTN,-1
++L(dLcr7):
++	ld	rWORD8, -8(r1)
++	ld	rWORD7, -16(r1)
++L(dLcr7x):
++	li	rRTN, 1
++	bgtlr	cr7
++	li	rRTN, -1
+ 	blr
+ 	.align	4
+ L(dLcr1):
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
+-	li	rRTN,1
++	ld	rWORD8, -8(r1)
++	ld	rWORD7, -16(r1)
++L(dLcr1x):
++	li	rRTN, 1
+ 	bgtlr	cr1
+-	li	rRTN,-1
++	li	rRTN, -1
+ 	blr
+ 	.align	4
+ L(dLcr6):
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
+-	li	rRTN,1
++	ld	rWORD8, -8(r1)
++	ld	rWORD7, -16(r1)
++L(dLcr6x):
++	li	rRTN, 1
+ 	bgtlr	cr6
+-	li	rRTN,-1
++	li	rRTN, -1
+ 	blr
+ 	.align	4
+ L(dLcr5):
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
++	ld	rWORD8, -8(r1)
++	ld	rWORD7, -16(r1)
+ L(dLcr5x):
+-	li	rRTN,1
++	li	rRTN, 1
+ 	bgtlr	cr5
+-	li	rRTN,-1
++	li	rRTN, -1
+ 	blr
+ 
+ 	.align	4
+ L(bytealigned):
+ 	mtctr	rN
+-	beq	cr6,L(zeroLength)
++#if 0
++/* Huh?  We've already branched on cr6!  */
++	beq	cr6, L(zeroLength)
++#endif
+ 
+ /* We need to prime this loop.  This loop is swing modulo scheduled
+    to avoid pipe delays.  The dependent instruction latencies (load to
+@@ -401,38 +598,38 @@
+    So we must precondition some registers and condition codes so that
+    we don't exit the loop early on the first iteration.  */
+ 
+-	lbz	rWORD1,0(rSTR1)
+-	lbz	rWORD2,0(rSTR2)
++	lbz	rWORD1, 0(rSTR1)
++	lbz	rWORD2, 0(rSTR2)
+ 	bdz	L(b11)
+-	cmpld	cr0,rWORD1,rWORD2
+-	lbz	rWORD3,1(rSTR1)
+-	lbz	rWORD4,1(rSTR2)
++	cmpld	cr7, rWORD1, rWORD2
++	lbz	rWORD3, 1(rSTR1)
++	lbz	rWORD4, 1(rSTR2)
+ 	bdz	L(b12)
+-	cmpld	cr1,rWORD3,rWORD4
+-	lbzu	rWORD5,2(rSTR1)
+-	lbzu	rWORD6,2(rSTR2)
++	cmpld	cr1, rWORD3, rWORD4
++	lbzu	rWORD5, 2(rSTR1)
++	lbzu	rWORD6, 2(rSTR2)
+ 	bdz	L(b13)
+ 	.align	4
+ L(bLoop):
+-	lbzu	rWORD1,1(rSTR1)
+-	lbzu	rWORD2,1(rSTR2)
+-	bne	cr0,L(bLcr0)
++	lbzu	rWORD1, 1(rSTR1)
++	lbzu	rWORD2, 1(rSTR2)
++	bne	cr7, L(bLcr7)
+ 
+-	cmpld	cr6,rWORD5,rWORD6
++	cmpld	cr6, rWORD5, rWORD6
+ 	bdz	L(b3i)
+ 
+-	lbzu	rWORD3,1(rSTR1)
+-	lbzu	rWORD4,1(rSTR2)
+-	bne	cr1,L(bLcr1)
++	lbzu	rWORD3, 1(rSTR1)
++	lbzu	rWORD4, 1(rSTR2)
++	bne	cr1, L(bLcr1)
+ 
+-	cmpld	cr0,rWORD1,rWORD2
++	cmpld	cr7, rWORD1, rWORD2
+ 	bdz	L(b2i)
+ 
+-	lbzu	rWORD5,1(rSTR1)
+-	lbzu	rWORD6,1(rSTR2)
+-	bne	cr6,L(bLcr6)
++	lbzu	rWORD5, 1(rSTR1)
++	lbzu	rWORD6, 1(rSTR2)
++	bne	cr6, L(bLcr6)
+ 
+-	cmpld	cr1,rWORD3,rWORD4
++	cmpld	cr1, rWORD3, rWORD4
+ 	bdnz	L(bLoop)
+ 
+ /* We speculatively loading bytes before we have tested the previous
+@@ -442,542 +639,727 @@
+    tested.  In this case we must complete the pending operations
+    before returning.  */
+ L(b1i):
+-	bne	cr0,L(bLcr0)
+-	bne	cr1,L(bLcr1)
++	bne	cr7, L(bLcr7)
++	bne	cr1, L(bLcr1)
+ 	b	L(bx56)
+ 	.align	4
+ L(b2i):
+-	bne	cr6,L(bLcr6)
+-	bne	cr0,L(bLcr0)
++	bne	cr6, L(bLcr6)
++	bne	cr7, L(bLcr7)
+ 	b	L(bx34)
+ 	.align	4
+ L(b3i):
+-	bne	cr1,L(bLcr1)
+-	bne	cr6,L(bLcr6)
++	bne	cr1, L(bLcr1)
++	bne	cr6, L(bLcr6)
+ 	b	L(bx12)
+ 	.align	4
+-L(bLcr0):
+-	li	rRTN,1
+-	bgtlr	cr0
+-	li	rRTN,-1
++L(bLcr7):
++	li	rRTN, 1
++	bgtlr	cr7
++	li	rRTN, -1
+ 	blr
+ L(bLcr1):
+-	li	rRTN,1
++	li	rRTN, 1
+ 	bgtlr	cr1
+-	li	rRTN,-1
++	li	rRTN, -1
+ 	blr
+ L(bLcr6):
+-	li	rRTN,1
++	li	rRTN, 1
+ 	bgtlr	cr6
+-	li	rRTN,-1
++	li	rRTN, -1
+ 	blr
+ 
+ L(b13):
+-	bne	cr0,L(bx12)
+-	bne	cr1,L(bx34)
++	bne	cr7, L(bx12)
++	bne	cr1, L(bx34)
+ L(bx56):
+-	sub	rRTN,rWORD5,rWORD6
++	sub	rRTN, rWORD5, rWORD6
+ 	blr
+ 	nop
+ L(b12):
+-	bne	cr0,L(bx12)
++	bne	cr7, L(bx12)
+ L(bx34):
+-	sub	rRTN,rWORD3,rWORD4
++	sub	rRTN, rWORD3, rWORD4
+ 	blr
+ L(b11):
+ L(bx12):
+-	sub	rRTN,rWORD1,rWORD2
++	sub	rRTN, rWORD1, rWORD2
+ 	blr
+ 	.align	4
+-L(zeroLengthReturn):
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
+ L(zeroLength):
+-	li	rRTN,0
++	li	rRTN, 0
+ 	blr
+ 
+ 	.align	4
+ /* At this point we know the strings have different alignment and the
+-   compare length is at least 8 bytes.  rBITDIF containes the low order
++   compare length is at least 8 bytes.  r12 contains the low order
+    3 bits of rSTR1 and cr5 contains the result of the logical compare
+-   of rBITDIF to 0.  If rBITDIF == 0 then rStr1 is double word
++   of r12 to 0.  If r12 == 0 then rSTR1 is double word
+    aligned and can perform the DWunaligned loop.
+ 
+-   Otherwise we know that rSTR1 is not aready DW aligned yet.
++   Otherwise we know that rSTR1 is not DW aligned yet.
+    So we can force the string addresses to the next lower DW
+-   boundary and special case this first DW word using shift left to
+-   ellimiate bits preceeding the first byte.  Since we want to join the
++   boundary and special case this first DW using shift left to
++   eliminate bits preceding the first byte.  Since we want to join the
+    normal (DWaligned) compare loop, starting at the second double word,
+    we need to adjust the length (rN) and special case the loop
+-   versioning for the first DW. This insures that the loop count is
++   versioning for the first DW. This ensures that the loop count is
+    correct and the first DW (shifted) is in the expected resister pair.  */
+-#define rSHL	r29	/* Unaligned shift left count.  */
+-#define rSHR	r28	/* Unaligned shift right count.  */
+-#define rB		r27	/* Left rotation temp for rWORD2.  */
+-#define rD		r26	/* Left rotation temp for rWORD4.  */
+-#define rF		r25	/* Left rotation temp for rWORD6.  */
+-#define rH		r24	/* Left rotation temp for rWORD8.  */
+-#define rA		r0	/* Right rotation temp for rWORD2.  */
+-#define rC		r12	/* Right rotation temp for rWORD4.  */
+-#define rE		r0	/* Right rotation temp for rWORD6.  */
+-#define rG		r12	/* Right rotation temp for rWORD8.  */
++#define rSHL		r29	/* Unaligned shift left count.  */
++#define rSHR		r28	/* Unaligned shift right count.  */
++#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
++#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
++#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
++#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
+ L(unaligned):
+-	std	r29,-24(r1)
+-	cfi_offset(r29,-24)
+-	clrldi	rSHL,rSTR2,61
+-	beq	cr6,L(duzeroLength)
+-	std	r28,-32(r1)
+-	cfi_offset(r28,-32)
+-	beq	cr5,L(DWunaligned)
+-	std	r27,-40(r1)
+-	cfi_offset(r27,-40)
+-/* Adjust the logical start of rSTR2 ro compensate for the extra bits
++	std	rSHL, -24(r1)
++	cfi_offset(rSHL, -24)
++	clrldi	rSHL, rSTR2, 61
++	beq	cr6, L(duzeroLength)
++	std	rSHR, -32(r1)
++	cfi_offset(rSHR, -32)
++	beq	cr5, L(DWunaligned)
++	std	rWORD8_SHIFT, -40(r1)
++	cfi_offset(rWORD8_SHIFT, -40)
++/* Adjust the logical start of rSTR2 to compensate for the extra bits
+    in the 1st rSTR1 DW.  */
+-	sub	r27,rSTR2,rBITDIF
++	sub	rWORD8_SHIFT, rSTR2, r12
+ /* But do not attempt to address the DW before that DW that contains
+    the actual start of rSTR2.  */
+-	clrrdi	rSTR2,rSTR2,3
+-	std	r26,-48(r1)
+-	cfi_offset(r26,-48)
+-/* Compute the leaft/right shift counts for the unalign rSTR2,
++	clrrdi	rSTR2, rSTR2, 3
++	std	rWORD2_SHIFT, -48(r1)
++	cfi_offset(rWORD2_SHIFT, -48)
++/* Compute the left/right shift counts for the unaligned rSTR2,
+    compensating for the logical (DW aligned) start of rSTR1.  */
+-	clrldi	rSHL,r27,61
+-	clrrdi	rSTR1,rSTR1,3
+-	std	r25,-56(r1)
+-	cfi_offset(r25,-56)
+-	sldi	rSHL,rSHL,3
+-	cmpld	cr5,r27,rSTR2
+-	add	rN,rN,rBITDIF
+-	sldi	r11,rBITDIF,3
+-	std	r24,-64(r1)
+-	cfi_offset(r24,-64)
+-	subfic	rSHR,rSHL,64
+-	srdi	rTMP,rN,5	/* Divide by 32 */
+-	andi.	rBITDIF,rN,24	/* Get the DW remainder */
++	clrldi	rSHL, rWORD8_SHIFT, 61
++	clrrdi	rSTR1, rSTR1, 3
++	std	rWORD4_SHIFT, -56(r1)
++	cfi_offset(rWORD4_SHIFT, -56)
++	sldi	rSHL, rSHL, 3
++	cmpld	cr5, rWORD8_SHIFT, rSTR2
++	add	rN, rN, r12
++	sldi	rWORD6, r12, 3
++	std	rWORD6_SHIFT, -64(r1)
++	cfi_offset(rWORD6_SHIFT, -64)
++	subfic	rSHR, rSHL, 64
++	srdi	r0, rN, 5	/* Divide by 32 */
++	andi.	r12, rN, 24	/* Get the DW remainder */
+ /* We normally need to load 2 DWs to start the unaligned rSTR2, but in
+    this special case those bits may be discarded anyway.  Also we
+    must avoid loading a DW where none of the bits are part of rSTR2 as
+    this may cross a page boundary and cause a page fault.  */
+-	li	rWORD8,0
+-	blt	cr5,L(dus0)
+-	ld	rWORD8,0(rSTR2)
+-	la	rSTR2,8(rSTR2)
+-	sld	rWORD8,rWORD8,rSHL
++	li	rWORD8, 0
++	blt	cr5, L(dus0)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD8, 0(rSTR2)
++	addi	rSTR2, rSTR2, 8
++#endif
++	sld	rWORD8, rWORD8, rSHL
+ 
+ L(dus0):
+-	ld	rWORD1,0(rSTR1)
+-	ld	rWORD2,0(rSTR2)
+-	cmpldi	cr1,rBITDIF,16
+-	cmpldi	cr7,rN,32
+-	srd	rG,rWORD2,rSHR
+-	clrldi	rN,rN,61
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD1, 0(rSTR1)
++	ld	rWORD2, 0(rSTR2)
++#endif
++	cmpldi	cr1, r12, 16
++	cmpldi	cr7, rN, 32
++	srd	r12, rWORD2, rSHR
++	clrldi	rN, rN, 61
+ 	beq	L(duPs4)
+-	mtctr	rTMP
+-	or	rWORD8,rG,rWORD8
+-	bgt	cr1,L(duPs3)
+-	beq	cr1,L(duPs2)
++	mtctr	r0
++	or	rWORD8, r12, rWORD8
++	bgt	cr1, L(duPs3)
++	beq	cr1, L(duPs2)
+ 
+ /* Remainder is 8 */
+ 	.align	4
+ L(dusP1):
+-	sld	rB,rWORD2,rSHL
+-	sld	rWORD7,rWORD1,r11
+-	sld	rWORD8,rWORD8,r11
+-	bge	cr7,L(duP1e)
++	sld	rWORD8_SHIFT, rWORD2, rSHL
++	sld	rWORD7, rWORD1, rWORD6
++	sld	rWORD8, rWORD8, rWORD6
++	bge	cr7, L(duP1e)
+ /* At this point we exit early with the first double word compare
+    complete and remainder of 0 to 7 bytes.  See L(du14) for details on
+    how we handle the remaining bytes.  */
+-	cmpld	cr5,rWORD7,rWORD8
+-	sldi.	rN,rN,3
+-	bne	cr5,L(duLcr5)
+-	cmpld	cr7,rN,rSHR
++	cmpld	cr5, rWORD7, rWORD8
++	sldi.	rN, rN, 3
++	bne	cr5, L(duLcr5)
++	cmpld	cr7, rN, rSHR
+ 	beq	L(duZeroReturn)
+-	li	rA,0
+-	ble	cr7,L(dutrim)
+-	ld	rWORD2,8(rSTR2)
+-	srd	rA,rWORD2,rSHR
++	li	r0, 0
++	ble	cr7, L(dutrim)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD2, 8(rSTR2)
++#endif
++	srd	r0, rWORD2, rSHR
+ 	b	L(dutrim)
+ /* Remainder is 16 */
+ 	.align	4
+ L(duPs2):
+-	sld	rH,rWORD2,rSHL
+-	sld	rWORD5,rWORD1,r11
+-	sld	rWORD6,rWORD8,r11
++	sld	rWORD6_SHIFT, rWORD2, rSHL
++	sld	rWORD5, rWORD1, rWORD6
++	sld	rWORD6, rWORD8, rWORD6
+ 	b	L(duP2e)
+ /* Remainder is 24 */
+ 	.align	4
+ L(duPs3):
+-	sld	rF,rWORD2,rSHL
+-	sld	rWORD3,rWORD1,r11
+-	sld	rWORD4,rWORD8,r11
++	sld	rWORD4_SHIFT, rWORD2, rSHL
++	sld	rWORD3, rWORD1, rWORD6
++	sld	rWORD4, rWORD8, rWORD6
+ 	b	L(duP3e)
+ /* Count is a multiple of 32, remainder is 0 */
+ 	.align	4
+ L(duPs4):
+-	mtctr	rTMP
+-	or	rWORD8,rG,rWORD8
+-	sld	rD,rWORD2,rSHL
+-	sld	rWORD1,rWORD1,r11
+-	sld	rWORD2,rWORD8,r11
++	mtctr	r0
++	or	rWORD8, r12, rWORD8
++	sld	rWORD2_SHIFT, rWORD2, rSHL
++	sld	rWORD1, rWORD1, rWORD6
++	sld	rWORD2, rWORD8, rWORD6
+ 	b	L(duP4e)
+ 
+ /* At this point we know rSTR1 is double word aligned and the
+    compare length is at least 8 bytes.  */
+ 	.align	4
+ L(DWunaligned):
+-	std	r27,-40(r1)
+-	cfi_offset(r27,-40)
+-	clrrdi	rSTR2,rSTR2,3
+-	std	r26,-48(r1)
+-	cfi_offset(r26,-48)
+-	srdi	rTMP,rN,5	/* Divide by 32 */
+-	std	r25,-56(r1)
+-	cfi_offset(r25,-56)
+-	andi.	rBITDIF,rN,24	/* Get the DW remainder */
+-	std	r24,-64(r1)
+-	cfi_offset(r24,-64)
+-	sldi	rSHL,rSHL,3
+-	ld	rWORD6,0(rSTR2)
+-	ldu	rWORD8,8(rSTR2)
+-	cmpldi	cr1,rBITDIF,16
+-	cmpldi	cr7,rN,32
+-	clrldi	rN,rN,61
+-	subfic	rSHR,rSHL,64
+-	sld	rH,rWORD6,rSHL
++	std	rWORD8_SHIFT, -40(r1)
++	cfi_offset(rWORD8_SHIFT, -40)
++	clrrdi	rSTR2, rSTR2, 3
++	std	rWORD2_SHIFT, -48(r1)
++	cfi_offset(rWORD2_SHIFT, -48)
++	srdi	r0, rN, 5	/* Divide by 32 */
++	std	rWORD4_SHIFT, -56(r1)
++	cfi_offset(rWORD4_SHIFT, -56)
++	andi.	r12, rN, 24	/* Get the DW remainder */
++	std	rWORD6_SHIFT, -64(r1)
++	cfi_offset(rWORD6_SHIFT, -64)
++	sldi	rSHL, rSHL, 3
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD6, 0, rSTR2
++	addi	rSTR2, rSTR2, 8
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD6, 0(rSTR2)
++	ldu	rWORD8, 8(rSTR2)
++#endif
++	cmpldi	cr1, r12, 16
++	cmpldi	cr7, rN, 32
++	clrldi	rN, rN, 61
++	subfic	rSHR, rSHL, 64
++	sld	rWORD6_SHIFT, rWORD6, rSHL
+ 	beq	L(duP4)
+-	mtctr	rTMP
+-	bgt	cr1,L(duP3)
+-	beq	cr1,L(duP2)
++	mtctr	r0
++	bgt	cr1, L(duP3)
++	beq	cr1, L(duP2)
+ 
+ /* Remainder is 8 */
+ 	.align	4
+ L(duP1):
+-	srd	rG,rWORD8,rSHR
+-	ld	rWORD7,0(rSTR1)
+-	sld	rB,rWORD8,rSHL
+-	or	rWORD8,rG,rH
+-	blt	cr7,L(duP1x)
++	srd	r12, rWORD8, rSHR
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD7, 0, rSTR1
++	addi	rSTR1, rSTR1, 8
++#else
++	ld	rWORD7, 0(rSTR1)
++#endif
++	sld	rWORD8_SHIFT, rWORD8, rSHL
++	or	rWORD8, r12, rWORD6_SHIFT
++	blt	cr7, L(duP1x)
+ L(duP1e):
+-	ld	rWORD1,8(rSTR1)
+-	ld	rWORD2,8(rSTR2)
+-	cmpld	cr5,rWORD7,rWORD8
+-	srd	rA,rWORD2,rSHR
+-	sld	rD,rWORD2,rSHL
+-	or	rWORD2,rA,rB
+-	ld	rWORD3,16(rSTR1)
+-	ld	rWORD4,16(rSTR2)
+-	cmpld	cr0,rWORD1,rWORD2
+-	srd	rC,rWORD4,rSHR
+-	sld	rF,rWORD4,rSHL
+-	bne	cr5,L(duLcr5)
+-	or	rWORD4,rC,rD
+-	ld	rWORD5,24(rSTR1)
+-	ld	rWORD6,24(rSTR2)
+-	cmpld	cr1,rWORD3,rWORD4
+-	srd	rE,rWORD6,rSHR
+-	sld	rH,rWORD6,rSHL
+-	bne	cr0,L(duLcr0)
+-	or	rWORD6,rE,rF
+-	cmpld	cr6,rWORD5,rWORD6
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD1, 8(rSTR1)
++	ld	rWORD2, 8(rSTR2)
++#endif
++	cmpld	cr5, rWORD7, rWORD8
++	srd	r0, rWORD2, rSHR
++	sld	rWORD2_SHIFT, rWORD2, rSHL
++	or	rWORD2, r0, rWORD8_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD3, 0, rSTR1
++	ldbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD3, 16(rSTR1)
++	ld	rWORD4, 16(rSTR2)
++#endif
++	cmpld	cr7, rWORD1, rWORD2
++	srd	r12, rWORD4, rSHR
++	sld	rWORD4_SHIFT, rWORD4, rSHL
++	bne	cr5, L(duLcr5)
++	or	rWORD4, r12, rWORD2_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD5, 0, rSTR1
++	ldbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD5, 24(rSTR1)
++	ld	rWORD6, 24(rSTR2)
++#endif
++	cmpld	cr1, rWORD3, rWORD4
++	srd	r0, rWORD6, rSHR
++	sld	rWORD6_SHIFT, rWORD6, rSHL
++	bne	cr7, L(duLcr7)
++	or	rWORD6, r0, rWORD4_SHIFT
++	cmpld	cr6, rWORD5, rWORD6
+ 	b	L(duLoop3)
+ 	.align	4
+ /* At this point we exit early with the first double word compare
+    complete and remainder of 0 to 7 bytes.  See L(du14) for details on
+    how we handle the remaining bytes.  */
+ L(duP1x):
+-	cmpld	cr5,rWORD7,rWORD8
+-	sldi.	rN,rN,3
+-	bne	cr5,L(duLcr5)
+-	cmpld	cr7,rN,rSHR
++	cmpld	cr5, rWORD7, rWORD8
++	sldi.	rN, rN, 3
++	bne	cr5, L(duLcr5)
++	cmpld	cr7, rN, rSHR
+ 	beq	L(duZeroReturn)
+-	li	rA,0
+-	ble	cr7,L(dutrim)
+-	ld	rWORD2,8(rSTR2)
+-	srd	rA,rWORD2,rSHR
++	li	r0, 0
++	ble	cr7, L(dutrim)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD2, 8(rSTR2)
++#endif
++	srd	r0, rWORD2, rSHR
+ 	b	L(dutrim)
+ /* Remainder is 16 */
+ 	.align	4
+ L(duP2):
+-	srd	rE,rWORD8,rSHR
+-	ld	rWORD5,0(rSTR1)
+-	or	rWORD6,rE,rH
+-	sld	rH,rWORD8,rSHL
++	srd	r0, rWORD8, rSHR
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD5, 0, rSTR1
++	addi	rSTR1, rSTR1, 8
++#else
++	ld	rWORD5, 0(rSTR1)
++#endif
++	or	rWORD6, r0, rWORD6_SHIFT
++	sld	rWORD6_SHIFT, rWORD8, rSHL
+ L(duP2e):
+-	ld	rWORD7,8(rSTR1)
+-	ld	rWORD8,8(rSTR2)
+-	cmpld	cr6,rWORD5,rWORD6
+-	srd	rG,rWORD8,rSHR
+-	sld	rB,rWORD8,rSHL
+-	or	rWORD8,rG,rH
+-	blt	cr7,L(duP2x)
+-	ld	rWORD1,16(rSTR1)
+-	ld	rWORD2,16(rSTR2)
+-	cmpld	cr5,rWORD7,rWORD8
+-	bne	cr6,L(duLcr6)
+-	srd	rA,rWORD2,rSHR
+-	sld	rD,rWORD2,rSHL
+-	or	rWORD2,rA,rB
+-	ld	rWORD3,24(rSTR1)
+-	ld	rWORD4,24(rSTR2)
+-	cmpld	cr0,rWORD1,rWORD2
+-	bne	cr5,L(duLcr5)
+-	srd	rC,rWORD4,rSHR
+-	sld	rF,rWORD4,rSHL
+-	or	rWORD4,rC,rD
+-	addi	rSTR1,rSTR1,8
+-	addi	rSTR2,rSTR2,8
+-	cmpld	cr1,rWORD3,rWORD4
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD7, 0, rSTR1
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD7, 8(rSTR1)
++	ld	rWORD8, 8(rSTR2)
++#endif
++	cmpld	cr6, rWORD5, rWORD6
++	srd	r12, rWORD8, rSHR
++	sld	rWORD8_SHIFT, rWORD8, rSHL
++	or	rWORD8, r12, rWORD6_SHIFT
++	blt	cr7, L(duP2x)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD1, 16(rSTR1)
++	ld	rWORD2, 16(rSTR2)
++#endif
++	cmpld	cr5, rWORD7, rWORD8
++	bne	cr6, L(duLcr6)
++	srd	r0, rWORD2, rSHR
++	sld	rWORD2_SHIFT, rWORD2, rSHL
++	or	rWORD2, r0, rWORD8_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD3, 0, rSTR1
++	ldbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD3, 24(rSTR1)
++	ld	rWORD4, 24(rSTR2)
++#endif
++	cmpld	cr7, rWORD1, rWORD2
++	bne	cr5, L(duLcr5)
++	srd	r12, rWORD4, rSHR
++	sld	rWORD4_SHIFT, rWORD4, rSHL
++	or	rWORD4, r12, rWORD2_SHIFT
++#ifndef __LITTLE_ENDIAN__
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#endif
++	cmpld	cr1, rWORD3, rWORD4
+ 	b	L(duLoop2)
+ 	.align	4
+ L(duP2x):
+-	cmpld	cr5,rWORD7,rWORD8
+-	addi	rSTR1,rSTR1,8
+-	addi	rSTR2,rSTR2,8
+-	bne	cr6,L(duLcr6)
+-	sldi.	rN,rN,3
+-	bne	cr5,L(duLcr5)
+-	cmpld	cr7,rN,rSHR
++	cmpld	cr5, rWORD7, rWORD8
++#ifndef __LITTLE_ENDIAN__
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#endif
++	bne	cr6, L(duLcr6)
++	sldi.	rN, rN, 3
++	bne	cr5, L(duLcr5)
++	cmpld	cr7, rN, rSHR
+ 	beq	L(duZeroReturn)
+-	li	rA,0
+-	ble	cr7,L(dutrim)
+-	ld	rWORD2,8(rSTR2)
+-	srd	rA,rWORD2,rSHR
++	li	r0, 0
++	ble	cr7, L(dutrim)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD2, 8(rSTR2)
++#endif
++	srd	r0, rWORD2, rSHR
+ 	b	L(dutrim)
+ 
+ /* Remainder is 24 */
+ 	.align	4
+ L(duP3):
+-	srd	rC,rWORD8,rSHR
+-	ld	rWORD3,0(rSTR1)
+-	sld	rF,rWORD8,rSHL
+-	or	rWORD4,rC,rH
++	srd	r12, rWORD8, rSHR
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD3, 0, rSTR1
++	addi	rSTR1, rSTR1, 8
++#else
++	ld	rWORD3, 0(rSTR1)
++#endif
++	sld	rWORD4_SHIFT, rWORD8, rSHL
++	or	rWORD4, r12, rWORD6_SHIFT
+ L(duP3e):
+-	ld	rWORD5,8(rSTR1)
+-	ld	rWORD6,8(rSTR2)
+-	cmpld	cr1,rWORD3,rWORD4
+-	srd	rE,rWORD6,rSHR
+-	sld	rH,rWORD6,rSHL
+-	or	rWORD6,rE,rF
+-	ld	rWORD7,16(rSTR1)
+-	ld	rWORD8,16(rSTR2)
+-	cmpld	cr6,rWORD5,rWORD6
+-	bne	cr1,L(duLcr1)
+-	srd	rG,rWORD8,rSHR
+-	sld	rB,rWORD8,rSHL
+-	or	rWORD8,rG,rH
+-	blt	cr7,L(duP3x)
+-	ld	rWORD1,24(rSTR1)
+-	ld	rWORD2,24(rSTR2)
+-	cmpld	cr5,rWORD7,rWORD8
+-	bne	cr6,L(duLcr6)
+-	srd	rA,rWORD2,rSHR
+-	sld	rD,rWORD2,rSHL
+-	or	rWORD2,rA,rB
+-	addi	rSTR1,rSTR1,16
+-	addi	rSTR2,rSTR2,16
+-	cmpld	cr0,rWORD1,rWORD2
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD5, 0, rSTR1
++	ldbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD5, 8(rSTR1)
++	ld	rWORD6, 8(rSTR2)
++#endif
++	cmpld	cr1, rWORD3, rWORD4
++	srd	r0, rWORD6, rSHR
++	sld	rWORD6_SHIFT, rWORD6, rSHL
++	or	rWORD6, r0, rWORD4_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD7, 0, rSTR1
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD7, 16(rSTR1)
++	ld	rWORD8, 16(rSTR2)
++#endif
++	cmpld	cr6, rWORD5, rWORD6
++	bne	cr1, L(duLcr1)
++	srd	r12, rWORD8, rSHR
++	sld	rWORD8_SHIFT, rWORD8, rSHL
++	or	rWORD8, r12, rWORD6_SHIFT
++	blt	cr7, L(duP3x)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD1, 24(rSTR1)
++	ld	rWORD2, 24(rSTR2)
++#endif
++	cmpld	cr5, rWORD7, rWORD8
++	bne	cr6, L(duLcr6)
++	srd	r0, rWORD2, rSHR
++	sld	rWORD2_SHIFT, rWORD2, rSHL
++	or	rWORD2, r0, rWORD8_SHIFT
++#ifndef __LITTLE_ENDIAN__
++	addi	rSTR1, rSTR1, 16
++	addi	rSTR2, rSTR2, 16
++#endif
++	cmpld	cr7, rWORD1, rWORD2
+ 	b	L(duLoop1)
+ 	.align	4
+ L(duP3x):
+-	addi	rSTR1,rSTR1,16
+-	addi	rSTR2,rSTR2,16
+-	bne	cr1,L(duLcr1)
+-	cmpld	cr5,rWORD7,rWORD8
+-	bne	cr6,L(duLcr6)
+-	sldi.	rN,rN,3
+-	bne	cr5,L(duLcr5)
+-	cmpld	cr7,rN,rSHR
++#ifndef __LITTLE_ENDIAN__
++	addi	rSTR1, rSTR1, 16
++	addi	rSTR2, rSTR2, 16
++#endif
++#if 0
++/* Redundant: cr1 was already tested before the branch to L(duP3x).  */
++	bne	cr1, L(duLcr1)
++#endif
++	cmpld	cr5, rWORD7, rWORD8
++	bne	cr6, L(duLcr6)
++	sldi.	rN, rN, 3
++	bne	cr5, L(duLcr5)
++	cmpld	cr7, rN, rSHR
+ 	beq	L(duZeroReturn)
+-	li	rA,0
+-	ble	cr7,L(dutrim)
+-	ld	rWORD2,8(rSTR2)
+-	srd	rA,rWORD2,rSHR
++	li	r0, 0
++	ble	cr7, L(dutrim)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD2, 8(rSTR2)
++#endif
++	srd	r0, rWORD2, rSHR
+ 	b	L(dutrim)
+ 
+ /* Count is a multiple of 32, remainder is 0 */
+ 	.align	4
+ L(duP4):
+-	mtctr	rTMP
+-	srd	rA,rWORD8,rSHR
+-	ld	rWORD1,0(rSTR1)
+-	sld	rD,rWORD8,rSHL
+-	or	rWORD2,rA,rH
++	mtctr	r0
++	srd	r0, rWORD8, rSHR
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	addi	rSTR1, rSTR1, 8
++#else
++	ld	rWORD1, 0(rSTR1)
++#endif
++	sld	rWORD2_SHIFT, rWORD8, rSHL
++	or	rWORD2, r0, rWORD6_SHIFT
+ L(duP4e):
+-	ld	rWORD3,8(rSTR1)
+-	ld	rWORD4,8(rSTR2)
+-	cmpld	cr0,rWORD1,rWORD2
+-	srd	rC,rWORD4,rSHR
+-	sld	rF,rWORD4,rSHL
+-	or	rWORD4,rC,rD
+-	ld	rWORD5,16(rSTR1)
+-	ld	rWORD6,16(rSTR2)
+-	cmpld	cr1,rWORD3,rWORD4
+-	bne	cr0,L(duLcr0)
+-	srd	rE,rWORD6,rSHR
+-	sld	rH,rWORD6,rSHL
+-	or	rWORD6,rE,rF
+-	ldu	rWORD7,24(rSTR1)
+-	ldu	rWORD8,24(rSTR2)
+-	cmpld	cr6,rWORD5,rWORD6
+-	bne	cr1,L(duLcr1)
+-	srd	rG,rWORD8,rSHR
+-	sld	rB,rWORD8,rSHL
+-	or	rWORD8,rG,rH
+-	cmpld	cr5,rWORD7,rWORD8
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD3, 0, rSTR1
++	ldbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD3, 8(rSTR1)
++	ld	rWORD4, 8(rSTR2)
++#endif
++	cmpld	cr7, rWORD1, rWORD2
++	srd	r12, rWORD4, rSHR
++	sld	rWORD4_SHIFT, rWORD4, rSHL
++	or	rWORD4, r12, rWORD2_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD5, 0, rSTR1
++	ldbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD5, 16(rSTR1)
++	ld	rWORD6, 16(rSTR2)
++#endif
++	cmpld	cr1, rWORD3, rWORD4
++	bne	cr7, L(duLcr7)
++	srd	r0, rWORD6, rSHR
++	sld	rWORD6_SHIFT, rWORD6, rSHL
++	or	rWORD6, r0, rWORD4_SHIFT
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD7, 0, rSTR1
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ldu	rWORD7, 24(rSTR1)
++	ldu	rWORD8, 24(rSTR2)
++#endif
++	cmpld	cr6, rWORD5, rWORD6
++	bne	cr1, L(duLcr1)
++	srd	r12, rWORD8, rSHR
++	sld	rWORD8_SHIFT, rWORD8, rSHL
++	or	rWORD8, r12, rWORD6_SHIFT
++	cmpld	cr5, rWORD7, rWORD8
+ 	bdz	L(du24)		/* Adjust CTR as we start with +4 */
+ /* This is the primary loop */
+ 	.align	4
+ L(duLoop):
+-	ld	rWORD1,8(rSTR1)
+-	ld	rWORD2,8(rSTR2)
+-	cmpld	cr1,rWORD3,rWORD4
+-	bne	cr6,L(duLcr6)
+-	srd	rA,rWORD2,rSHR
+-	sld	rD,rWORD2,rSHL
+-	or	rWORD2,rA,rB
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD1, 8(rSTR1)
++	ld	rWORD2, 8(rSTR2)
++#endif
++	cmpld	cr1, rWORD3, rWORD4
++	bne	cr6, L(duLcr6)
++	srd	r0, rWORD2, rSHR
++	sld	rWORD2_SHIFT, rWORD2, rSHL
++	or	rWORD2, r0, rWORD8_SHIFT
+ L(duLoop1):
+-	ld	rWORD3,16(rSTR1)
+-	ld	rWORD4,16(rSTR2)
+-	cmpld	cr6,rWORD5,rWORD6
+-	bne	cr5,L(duLcr5)
+-	srd	rC,rWORD4,rSHR
+-	sld	rF,rWORD4,rSHL
+-	or	rWORD4,rC,rD
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD3, 0, rSTR1
++	ldbrx	rWORD4, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD3, 16(rSTR1)
++	ld	rWORD4, 16(rSTR2)
++#endif
++	cmpld	cr6, rWORD5, rWORD6
++	bne	cr5, L(duLcr5)
++	srd	r12, rWORD4, rSHR
++	sld	rWORD4_SHIFT, rWORD4, rSHL
++	or	rWORD4, r12, rWORD2_SHIFT
+ L(duLoop2):
+-	ld	rWORD5,24(rSTR1)
+-	ld	rWORD6,24(rSTR2)
+-	cmpld	cr5,rWORD7,rWORD8
+-	bne	cr0,L(duLcr0)
+-	srd	rE,rWORD6,rSHR
+-	sld	rH,rWORD6,rSHL
+-	or	rWORD6,rE,rF
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD5, 0, rSTR1
++	ldbrx	rWORD6, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD5, 24(rSTR1)
++	ld	rWORD6, 24(rSTR2)
++#endif
++	cmpld	cr5, rWORD7, rWORD8
++	bne	cr7, L(duLcr7)
++	srd	r0, rWORD6, rSHR
++	sld	rWORD6_SHIFT, rWORD6, rSHL
++	or	rWORD6, r0, rWORD4_SHIFT
+ L(duLoop3):
+-	ldu	rWORD7,32(rSTR1)
+-	ldu	rWORD8,32(rSTR2)
+-	cmpld	cr0,rWORD1,rWORD2
+-	bne-	cr1,L(duLcr1)
+-	srd	rG,rWORD8,rSHR
+-	sld	rB,rWORD8,rSHL
+-	or	rWORD8,rG,rH
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD7, 0, rSTR1
++	ldbrx	rWORD8, 0, rSTR2
++	addi	rSTR1, rSTR1, 8
++	addi	rSTR2, rSTR2, 8
++#else
++	ldu	rWORD7, 32(rSTR1)
++	ldu	rWORD8, 32(rSTR2)
++#endif
++	cmpld	cr7, rWORD1, rWORD2
++	bne	cr1, L(duLcr1)
++	srd	r12, rWORD8, rSHR
++	sld	rWORD8_SHIFT, rWORD8, rSHL
++	or	rWORD8, r12, rWORD6_SHIFT
+ 	bdnz	L(duLoop)
+ 
+ L(duL4):
+-	bne	cr1,L(duLcr1)
+-	cmpld	cr1,rWORD3,rWORD4
+-	bne	cr6,L(duLcr6)
+-	cmpld	cr6,rWORD5,rWORD6
+-	bne	cr5,L(duLcr5)
+-	cmpld	cr5,rWORD7,rWORD8
++#if 0
++/* Redundant: the loop body has already branched on cr1.  */
++	bne	cr1, L(duLcr1)
++#endif
++	cmpld	cr1, rWORD3, rWORD4
++	bne	cr6, L(duLcr6)
++	cmpld	cr6, rWORD5, rWORD6
++	bne	cr5, L(duLcr5)
++	cmpld	cr5, rWORD7, rWORD8
+ L(du44):
+-	bne	cr0,L(duLcr0)
++	bne	cr7, L(duLcr7)
+ L(du34):
+-	bne	cr1,L(duLcr1)
++	bne	cr1, L(duLcr1)
+ L(du24):
+-	bne	cr6,L(duLcr6)
++	bne	cr6, L(duLcr6)
+ L(du14):
+-	sldi.	rN,rN,3
+-	bne	cr5,L(duLcr5)
++	sldi.	rN, rN, 3
++	bne	cr5, L(duLcr5)
+ /* At this point we have a remainder of 1 to 7 bytes to compare.  We use
+-   shift right double to elliminate bits beyond the compare length.
+-   This allows the use of double word subtract to compute the final
+-   result.
++   shift right double to eliminate bits beyond the compare length.
+ 
+    However it may not be safe to load rWORD2 which may be beyond the
+    string length. So we compare the bit length of the remainder to
+    the right shift count (rSHR). If the bit count is less than or equal
+    we do not need to load rWORD2 (all significant bits are already in
+-   rB).  */
+-	cmpld	cr7,rN,rSHR
++   rWORD8_SHIFT).  */
++	cmpld	cr7, rN, rSHR
+ 	beq	L(duZeroReturn)
+-	li	rA,0
+-	ble	cr7,L(dutrim)
+-	ld	rWORD2,8(rSTR2)
+-	srd	rA,rWORD2,rSHR
++	li	r0, 0
++	ble	cr7, L(dutrim)
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD2, 0, rSTR2
++	addi	rSTR2, rSTR2, 8
++#else
++	ld	rWORD2, 8(rSTR2)
++#endif
++	srd	r0, rWORD2, rSHR
+ 	.align	4
+ L(dutrim):
+-	ld	rWORD1,8(rSTR1)
+-	ld	rWORD8,-8(r1)
+-	subfic	rN,rN,64	/* Shift count is 64 - (rN * 8).  */
+-	or	rWORD2,rA,rB
+-	ld	rWORD7,-16(r1)
+-	ld	r29,-24(r1)
+-	srd	rWORD1,rWORD1,rN
+-	srd	rWORD2,rWORD2,rN
+-	ld	r28,-32(r1)
+-	ld	r27,-40(r1)
+-	li	rRTN,0
+-	cmpld	cr0,rWORD1,rWORD2
+-	ld	r26,-48(r1)
+-	ld	r25,-56(r1)
+-	beq	cr0,L(dureturn24)
+-	li	rRTN,1
+-	ld	r24,-64(r1)
+-	bgtlr	cr0
+-	li	rRTN,-1
+-	blr
+-	.align	4
+-L(duLcr0):
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
+-	li	rRTN,1
+-	bgt	cr0,L(dureturn29)
+-	ld	r29,-24(r1)
+-	ld	r28,-32(r1)
+-	li	rRTN,-1
++#ifdef __LITTLE_ENDIAN__
++	ldbrx	rWORD1, 0, rSTR1
++#else
++	ld	rWORD1, 8(rSTR1)
++#endif
++	ld	rWORD8, -8(r1)
++	subfic	rN, rN, 64	/* rN is already in bits; count is 64 - rN.  */
++	or	rWORD2, r0, rWORD8_SHIFT	/* Merge both parts of last rSTR2 DW.  */
++	ld	rWORD7, -16(r1)
++	ld	rSHL, -24(r1)
++	srd	rWORD1, rWORD1, rN	/* Drop bits beyond the compare length.  */
++	srd	rWORD2, rWORD2, rN	/* Likewise for the rSTR2 side.  */
++	ld	rSHR, -32(r1)
++	ld	rWORD8_SHIFT, -40(r1)
++	li	rRTN, 0
++	cmpld	cr7, rWORD1, rWORD2
++	ld	rWORD2_SHIFT, -48(r1)
++	ld	rWORD4_SHIFT, -56(r1)
++	beq	cr7, L(dureturn24)	/* Equal: return 0 via last restore.  */
++	li	rRTN, 1
++	ld	rWORD6_SHIFT, -64(r1)
++	bgtlr	cr7
++	li	rRTN, -1
++	blr
++	.align	4
++L(duLcr7):
++	ld	rWORD8, -8(r1)
++	ld	rWORD7, -16(r1)
++	li	rRTN, 1
++	bgt	cr7, L(dureturn29)
++	ld	rSHL, -24(r1)
++	ld	rSHR, -32(r1)
++	li	rRTN, -1
+ 	b	L(dureturn27)
+ 	.align	4
+ L(duLcr1):
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
+-	li	rRTN,1
+-	bgt	cr1,L(dureturn29)
+-	ld	r29,-24(r1)
+-	ld	r28,-32(r1)
+-	li	rRTN,-1
++	ld	rWORD8, -8(r1)
++	ld	rWORD7, -16(r1)
++	li	rRTN, 1
++	bgt	cr1, L(dureturn29)
++	ld	rSHL, -24(r1)
++	ld	rSHR, -32(r1)
++	li	rRTN, -1
+ 	b	L(dureturn27)
+ 	.align	4
+ L(duLcr6):
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
+-	li	rRTN,1
+-	bgt	cr6,L(dureturn29)
+-	ld	r29,-24(r1)
+-	ld	r28,-32(r1)
+-	li	rRTN,-1
++	ld	rWORD8, -8(r1)
++	ld	rWORD7, -16(r1)
++	li	rRTN, 1
++	bgt	cr6, L(dureturn29)
++	ld	rSHL, -24(r1)
++	ld	rSHR, -32(r1)
++	li	rRTN, -1
+ 	b	L(dureturn27)
+ 	.align	4
+ L(duLcr5):
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
+-	li	rRTN,1
+-	bgt	cr5,L(dureturn29)
+-	ld	r29,-24(r1)
+-	ld	r28,-32(r1)
+-	li	rRTN,-1
++	ld	rWORD8, -8(r1)
++	ld	rWORD7, -16(r1)
++	li	rRTN, 1
++	bgt	cr5, L(dureturn29)
++	ld	rSHL, -24(r1)
++	ld	rSHR, -32(r1)
++	li	rRTN, -1
+ 	b	L(dureturn27)
+ 	.align	3
+ L(duZeroReturn):
+-	li	rRTN,0
++	li	rRTN, 0
+ 	.align	4
+ L(dureturn):
+-	ld	rWORD8,-8(r1)
+-	ld	rWORD7,-16(r1)
++	ld	rWORD8, -8(r1)
++	ld	rWORD7, -16(r1)
+ L(dureturn29):
+-	ld	r29,-24(r1)
+-	ld	r28,-32(r1)
++	ld	rSHL, -24(r1)
++	ld	rSHR, -32(r1)
+ L(dureturn27):
+-	ld	r27,-40(r1)
++	ld	rWORD8_SHIFT, -40(r1)
+ L(dureturn26):
+-	ld	r26,-48(r1)
++	ld	rWORD2_SHIFT, -48(r1)
+ L(dureturn25):
+-	ld	r25,-56(r1)
++	ld	rWORD4_SHIFT, -56(r1)
+ L(dureturn24):
+-	ld	r24,-64(r1)
++	ld	rWORD6_SHIFT, -64(r1)
+ 	blr
+ L(duzeroLength):
+-	li	rRTN,0
++	li	rRTN, 0
+ 	blr
+ 
+-END (BP_SYM (memcmp))
++END (memcmp)
+ libc_hidden_builtin_def (memcmp)
+-weak_alias (memcmp,bcmp)
++weak_alias (memcmp, bcmp)