diff --git a/arch/blackfin/lib/Makefile b/arch/blackfin/lib/Makefile index 635288fc5f54..bfdad52c570b 100644 --- a/arch/blackfin/lib/Makefile +++ b/arch/blackfin/lib/Makefile @@ -4,7 +4,7 @@ lib-y := \ ashldi3.o ashrdi3.o lshrdi3.o \ - muldi3.o divsi3.o udivsi3.o modsi3.o umodsi3.o \ + muldi3.o divsi3.o udivsi3.o udivdi3.o modsi3.o umodsi3.o \ checksum.o memcpy.o memset.o memcmp.o memchr.o memmove.o \ strcmp.o strcpy.o strncmp.o strncpy.o \ umulsi3_highpart.o smulsi3_highpart.o \ diff --git a/arch/blackfin/lib/udivdi3.S b/arch/blackfin/lib/udivdi3.S new file mode 100644 index 000000000000..ad1ebee675e1 --- /dev/null +++ b/arch/blackfin/lib/udivdi3.S @@ -0,0 +1,375 @@ +/* + * udivdi3.S - unsigned long long division + * + * Copyright 2003-2007 Analog Devices Inc. + * Enter bugs at http://blackfin.uclinux.org/ + * + * Licensed under the GPLv2 or later. + */ + +#include + +#define CARRY AC0 + +#ifdef CONFIG_ARITHMETIC_OPS_L1 +.section .l1.text +#else +.text +#endif + + +ENTRY(___udivdi3) + R3 = [SP + 12]; + [--SP] = (R7:4, P5:3); + + /* Attempt to use divide primitive first; these will handle + ** most cases, and they're quick - avoids stalls incurred by + ** testing for identities. + */ + + R4 = R2 | R3; + CC = R4 == 0; + IF CC JUMP .LDIV_BY_ZERO; + + R4.H = 0x8000; + R4 >>>= 16; // R4 now 0xFFFF8000 + R5 = R0 | R2; // If either dividend or + R4 = R5 & R4; // divisor have bits in + CC = R4; // top half or low half's sign + IF CC JUMP .LIDENTS; // bit, skip builtins. + R4 = R1 | R3; // Also check top halves + CC = R4; + IF CC JUMP .LIDENTS; + + /* Can use the builtins. */ + + AQ = CC; // Clear AQ (CC==0) + DIVQ(R0, R2); + DIVQ(R0, R2); + DIVQ(R0, R2); + DIVQ(R0, R2); + DIVQ(R0, R2); + DIVQ(R0, R2); + DIVQ(R0, R2); + DIVQ(R0, R2); + DIVQ(R0, R2); + DIVQ(R0, R2); + DIVQ(R0, R2); + DIVQ(R0, R2); + DIVQ(R0, R2); + DIVQ(R0, R2); + DIVQ(R0, R2); + DIVQ(R0, R2); + DIVQ(R0, R2); + R0 = R0.L (Z); + R1 = 0; + (R7:4, P5:3) = [SP++]; + RTS; + +.LIDENTS: + /* Test for common identities. Value to be returned is + ** placed in R6,R7. + */ + // Check for 0/y, return 0 + R4 = R0 | R1; + CC = R4 == 0; + IF CC JUMP .LRETURN_R0; + + // Check for x/x, return 1 + R6 = R0 - R2; // If x == y, then both R6 and R7 will be zero + R7 = R1 - R3; + R4 = R6 | R7; // making R4 zero. + R6 += 1; // which would now make R6:R7==1. + CC = R4 == 0; + IF CC JUMP .LRETURN_IDENT; + + // Check for x/1, return x + R6 = R0; + R7 = R1; + CC = R3 == 0; + IF !CC JUMP .Lnexttest; + CC = R2 == 1; + IF CC JUMP .LRETURN_IDENT; + +.Lnexttest: + R4.L = ONES R2; // check for div by power of two which + R5.L = ONES R3; // can be done using a shift + R6 = PACK (R5.L, R4.L); + CC = R6 == 1; + IF CC JUMP .Lpower_of_two_upper_zero; + R6 = PACK (R4.L, R5.L); + CC = R6 == 1; + IF CC JUMP .Lpower_of_two_lower_zero; + + // Check for x < y, return 0 + R6 = 0; + R7 = R6; + CC = R1 < R3 (IU); + IF CC JUMP .LRETURN_IDENT; + CC = R1 == R3; + IF !CC JUMP .Lno_idents; + CC = R0 < R2 (IU); + IF CC JUMP .LRETURN_IDENT; + +.Lno_idents: // Idents don't match. Go for the full operation + + + // If X, or X and Y have high bit set, it'll affect the + // results, so shift right one to stop this. Note: we've already + // checked that X >= Y, so Y's msb won't be set unless X's + // is. + + R4 = 0; + CC = R1 < 0; + IF !CC JUMP .Lx_msb_clear; + CC = !CC; // 1 -> 0; + R1 = ROT R1 BY -1; // Shift X >> 1 + R0 = ROT R0 BY -1; // lsb -> CC + BITSET(R4,31); // to record only x msb was set + CC = R3 < 0; + IF !CC JUMP .Ly_msb_clear; + CC = !CC; + R3 = ROT R3 BY -1; // Shift Y >> 1 + R2 = ROT R2 BY -1; + BITCLR(R4,31); // clear bit to record only x msb was set + +.Ly_msb_clear: +.Lx_msb_clear: + // Bit 31 in R4 indicates X msb set, but Y msb wasn't, and no bits + // were lost, so we should shift result left by one. + + [--SP] = R4; // save for later + + // In the loop that follows, each iteration we add + // either Y' or -Y' to the Remainder. We compute the + // negated Y', and store, for convenience. Y' goes + // into P0:P1, while -Y' goes into P2:P3. + + P0 = R2; + P1 = R3; + R2 = -R2; + CC = CARRY; + CC = !CC; + R4 = CC; + R3 = -R3; + R3 = R3 - R4; + + R6 = 0; // remainder = 0 + R7 = R6; + + [--SP] = R2; P2 = SP; + [--SP] = R3; P3 = SP; + [--SP] = R6; P5 = SP; // AQ = 0 + [--SP] = P1; + + /* In the loop that follows, we use the following + ** register assignments: + ** R0,R1 X, workspace + ** R2,R3 Y, workspace + ** R4,R5 partial Div + ** R6,R7 partial remainder + ** P5 AQ + ** The remainder and div form a 128-bit number, with + ** the remainder in the high 64-bits. + */ + R4 = R0; // Div = X' + R5 = R1; + R3 = 0; + + P4 = 64; // Iterate once per bit + LSETUP(.LULST,.LULEND) LC0 = P4; +.LULST: + /* Shift Div and remainder up by one. The bit shifted + ** out of the top of the quotient is shifted into the bottom + ** of the remainder. + */ + CC = R3; + R4 = ROT R4 BY 1; + R5 = ROT R5 BY 1 || // low q to high q + R2 = [P5]; // load saved AQ + R6 = ROT R6 BY 1 || // high q to low r + R0 = [P2]; // load -Y' + R7 = ROT R7 BY 1 || // low r to high r + R1 = [P3]; + + // Assume add -Y' + CC = R2 < 0; // But if AQ is set... + IF CC R0 = P0; // then add Y' instead + IF CC R1 = P1; + + R6 = R6 + R0; // Rem += (Y' or -Y') + CC = CARRY; + R0 = CC; + R7 = R7 + R1; + R7 = R7 + R0 (NS) || + R1 = [SP]; + // Set the next AQ bit + R1 = R7 ^ R1; // from Remainder and Y' + R1 = R1 >> 31 || // Negate AQ's value, and + [P5] = R1; // save next AQ + BITTGL(R1, 0); // add neg AQ to the Div +.LULEND: R4 = R4 + R1; + + R6 = [SP + 16]; + + R0 = R4; + R1 = R5; + CC = BITTST(R6,30); // Just set CC=0 + R4 = ROT R0 BY 1; // but if we had to shift X, + R5 = ROT R1 BY 1; // and didn't shift any bits out, + CC = BITTST(R6,31); // then the result will be half as + IF CC R0 = R4; // much as required, so shift left + IF CC R1 = R5; // one space. + + SP += 20; + (R7:4, P5:3) = [SP++]; + RTS; + +.Lpower_of_two: + /* Y has a single bit set, which means it's a power of two. + ** That means we can perform the division just by shifting + ** X to the right the appropriate number of bits + */ + + /* signbits returns the number of sign bits, minus one. + ** 1=>30, 2=>29, ..., 0x40000000=>0. Which means we need + ** to shift right n-signbits spaces. It also means 0x80000000 + ** is a special case, because that *also* gives a signbits of 0 + */ +.Lpower_of_two_lower_zero: + R7 = 0; + R6 = R1 >> 31; + CC = R3 < 0; + IF CC JUMP .LRETURN_IDENT; + + R2.L = SIGNBITS R3; + R2 = R2.L (Z); + R2 += -62; + (R7:4, P5:3) = [SP++]; + JUMP ___lshftli; + +.Lpower_of_two_upper_zero: + CC = R2 < 0; + IF CC JUMP .Lmaxint_shift; + + R2.L = SIGNBITS R2; + R2 = R2.L (Z); + R2 += -30; + (R7:4, P5:3) = [SP++]; + JUMP ___lshftli; + +.Lmaxint_shift: + R2 = -31; + (R7:4, P5:3) = [SP++]; + JUMP ___lshftli; + +.LRETURN_IDENT: + R0 = R6; + R1 = R7; +.LRETURN_R0: + (R7:4, P5:3) = [SP++]; + RTS; +.LDIV_BY_ZERO: + R0 = ~R2; + R1 = R0; + (R7:4, P5:3) = [SP++]; + RTS; + +ENDPROC(___udivdi3) + + +ENTRY(___lshftli) + CC = R2 == 0; + IF CC JUMP .Lfinished; // nothing to do + CC = R2 < 0; + IF CC JUMP .Lrshift; + R3 = 64; + CC = R2 < R3; + IF !CC JUMP .Lretzero; + + // We're shifting left, and it's less than 64 bits, so + // a valid result will be returned. + + R3 >>= 1; // R3 now 32 + CC = R2 < R3; + + IF !CC JUMP .Lzerohalf; + + // We're shifting left, between 1 and 31 bits, which means + // some of the low half will be shifted into the high half. + // Work out how much. + + R3 = R3 - R2; + + // Save that much data from the bottom half. + + P1 = R7; + R7 = R0; + R7 >>= R3; + + // Adjust both parts of the parameter. + + R0 <<= R2; + R1 <<= R2; + + // And include the bits moved across. + + R1 = R1 | R7; + R7 = P1; + RTS; + +.Lzerohalf: + // We're shifting left, between 32 and 63 bits, so the + // bottom half will become zero, and the top half will + // lose some bits. How many? + + R2 = R2 - R3; // N - 32 + R1 = LSHIFT R0 BY R2.L; + R0 = R0 - R0; + RTS; + +.Lretzero: + R0 = R0 - R0; + R1 = R0; +.Lfinished: + RTS; + +.Lrshift: + // We're shifting right, but by how much? + R2 = -R2; + R3 = 64; + CC = R2 < R3; + IF !CC JUMP .Lretzero; + + // Shifting right less than 64 bits, so some result bits will + // be retained. + + R3 >>= 1; // R3 now 32 + CC = R2 < R3; + IF !CC JUMP .Lsignhalf; + + // Shifting right between 1 and 31 bits, so need to copy + // data across words. + + P1 = R7; + R3 = R3 - R2; + R7 = R1; + R7 <<= R3; + R1 >>= R2; + R0 >>= R2; + R0 = R7 | R0; + R7 = P1; + RTS; + +.Lsignhalf: + // Shifting right between 32 and 63 bits, so the top half + // will become all zero-bits, and the bottom half is some + // of the top half. But how much? + + R2 = R2 - R3; + R0 = R1; + R0 >>= R2; + R1 = 0; + RTS; + +ENDPROC(___lshftli)