* $NetBSD: stan.sa,v 1.4 2000/03/13 23:52:32 soren Exp $ * MOTOROLA MICROPROCESSOR & MEMORY TECHNOLOGY GROUP * M68000 Hi-Performance Microprocessor Division * M68040 Software Package * * M68040 Software Package Copyright (c) 1993, 1994 Motorola Inc. * All rights reserved. * * THE SOFTWARE is provided on an "AS IS" basis and without warranty. * To the maximum extent permitted by applicable law, * MOTOROLA DISCLAIMS ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, * INCLUDING IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A * PARTICULAR PURPOSE and any warranty against infringement with * regard to the SOFTWARE (INCLUDING ANY MODIFIED VERSIONS THEREOF) * and any accompanying written materials. * * To the maximum extent permitted by applicable law, * IN NO EVENT SHALL MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER * (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS * PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR * OTHER PECUNIARY LOSS) ARISING OF THE USE OR INABILITY TO USE THE * SOFTWARE. Motorola assumes no responsibility for the maintenance * and support of the SOFTWARE. * * You are hereby granted a copyright license to use, modify, and * distribute the SOFTWARE so long as this entire notice is retained * without alteration in any modified and/or redistributed versions, * and that such modified versions are clearly identified as such. * No licenses are granted by implication, estoppel or otherwise * under any patents or trademarks of Motorola, Inc. * * stan.sa 3.3 7/29/91 * * The entry point stan computes the tangent of * an input argument; * stand does the same except for denormalized input. * * Input: Double-extended number X in location pointed to * by address register a0. * * Output: The value tan(X) returned in floating-point register Fp0. * * Accuracy and Monotonicity: The returned result is within 3 ulp in * 64 significant bit, i.e. within 0.5001 ulp to 53 bits if the * result is subsequently rounded to double precision. 
*               The result is provably monotonic in double precision.
*
*       Speed: The program sTAN takes approximately 170 cycles for
*               input argument X such that |X| < 15Pi, which is the usual
*               situation.
*
*       Algorithm:
*
*       1. If |X| >= 15Pi or |X| < 2**(-40), go to 6.
*
*       2. Decompose X as X = N(Pi/2) + r where |r| <= Pi/4. Let
*               k = N mod 2, so in particular, k = 0 or 1.
*
*       3. If k is odd, go to 5.
*
*       4. (k is even) Tan(X) = tan(r) and tan(r) is approximated by a
*               rational function U/V where
*               U = r + r*s*(P1 + s*(P2 + s*P3)), and
*               V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))),  s = r*r.
*               Exit.
*
*       5. (k is odd) Tan(X) = -cot(r). Since tan(r) is approximated by a
*               rational function U/V where
*               U = r + r*s*(P1 + s*(P2 + s*P3)), and
*               V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))), s = r*r,
*               -Cot(r) = -V/U. Exit.
*
*       6. If |X| > 1, go to 8.
*
*       7. (|X|<2**(-40)) Tan(X) = X. Exit.
*
*       8. Overwrite X by X := X rem 2Pi. Now that |X| <= Pi, go back to 2.
*
*       NOTE(review): the code under REDUCEX documents its own result as
*               "X REM PI/2" with the integer quotient kept in N, and it
*               rejoins the main path at TANCONT rather than at step 2.
*               Step 8 above appears to describe an older formulation.
*

STAN    IDNT    2,1 Motorola 040 Floating Point Software Package

        section 8

        include fpsp.h

*--BOUNDS1 holds the same two values that stan compares against as
*--immediates: $3FD78000 (the 2**(-40) test) and $4004BC7E (the 15 PI
*--test). The label itself is not referenced in the visible code.
BOUNDS1 DC.L $3FD78000,$4004BC7E
*--TWOBYPI is read with FMUL.D, i.e. as a double precision 2/PI.
TWOBYPI DC.L $3FE45F30,$6DC9C883

*--Polynomial coefficients for U = r + r*s*(P1 + s*(P2 + s*P3)) and
*--V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))).
*--TANQ4 and TANP3 are loaded with FMOVE.D (double precision).
TANQ4   DC.L $3EA0B759,$F50F8688
TANP3   DC.L $BEF2BAA5,$A8924F04

*--TANQ3 is added with FADD.D: a double in the first two longwords,
*--followed by two zero longwords.
TANQ3   DC.L $BF346F59,$B39BA65F,$00000000,$00000000

*--TANP2, TANQ2, TANP1 and TANQ1 are added with FADD.X (extended
*--precision, three longwords) followed by one zero longword.
TANP2   DC.L $3FF60000,$E073D3FC,$199C4A00,$00000000

TANQ2   DC.L $3FF90000,$D23CD684,$15D95FA1,$00000000

TANP1   DC.L $BFFC0000,$8895A6C5,$FB423BCA,$00000000

TANQ1   DC.L $BFFD0000,$EEF57E0D,$A84BC8CE,$00000000

*--INVTWOPI, TWOPI1 and TWOPI2 are not referenced in the visible code.
*--INVTWOPI carries the same mantissa ($A2F9836E,$4E44152A) that WORK
*--uses for 2/PI, with a biased exponent of $3FFC instead of $3FFE.
INVTWOPI DC.L $3FFC0000,$A2F9836E,$4E44152A,$00000000

TWOPI1  DC.L $40010000,$C90FDAA2,$00000000,$00000000
TWOPI2  DC.L $3FDF0000,$85A308D4,$00000000,$00000000

*--N*PI/2, -32 <= N <= 32, IN A LEADING TERM IN EXT. AND TRAILING
*--TERM IN SGL. NOTE THAT PI IS 64-BIT LONG, THUS N*PI/2 IS AT
*--MOST 69 BITS LONG.
*--PITBL: table of N*PI/2, ordered from N = -32 upward. Each entry is
*--16 bytes: three longwords of extended precision leading term (read
*--with FSUB.X (a1)+) followed by one longword single precision trailing
*--term (read with FSUB.S (a1)). The N = 0 entry sits at PITBL+$200,
*--which is the base address TANMAIN indexes from.
        xdef    PITBL
PITBL:
*--N = -32 .. -21
  DC.L  $C0040000,$C90FDAA2,$2168C235,$21800000
  DC.L  $C0040000,$C2C75BCD,$105D7C23,$A0D00000
  DC.L  $C0040000,$BC7EDCF7,$FF523611,$A1E80000
  DC.L  $C0040000,$B6365E22,$EE46F000,$21480000
  DC.L  $C0040000,$AFEDDF4D,$DD3BA9EE,$A1200000
  DC.L  $C0040000,$A9A56078,$CC3063DD,$21FC0000
  DC.L  $C0040000,$A35CE1A3,$BB251DCB,$21100000
  DC.L  $C0040000,$9D1462CE,$AA19D7B9,$A1580000
  DC.L  $C0040000,$96CBE3F9,$990E91A8,$21E00000
  DC.L  $C0040000,$90836524,$88034B96,$20B00000
  DC.L  $C0040000,$8A3AE64F,$76F80584,$A1880000
  DC.L  $C0040000,$83F2677A,$65ECBF73,$21C40000
*--N = -20 .. -11
  DC.L  $C0030000,$FB53D14A,$A9C2F2C2,$20000000
  DC.L  $C0030000,$EEC2D3A0,$87AC669F,$21380000
  DC.L  $C0030000,$E231D5F6,$6595DA7B,$A1300000
  DC.L  $C0030000,$D5A0D84C,$437F4E58,$9FC00000
  DC.L  $C0030000,$C90FDAA2,$2168C235,$21000000
  DC.L  $C0030000,$BC7EDCF7,$FF523611,$A1680000
  DC.L  $C0030000,$AFEDDF4D,$DD3BA9EE,$A0A00000
  DC.L  $C0030000,$A35CE1A3,$BB251DCB,$20900000
  DC.L  $C0030000,$96CBE3F9,$990E91A8,$21600000
  DC.L  $C0030000,$8A3AE64F,$76F80584,$A1080000
*--N = -10 .. -6
  DC.L  $C0020000,$FB53D14A,$A9C2F2C2,$1F800000
  DC.L  $C0020000,$E231D5F6,$6595DA7B,$A0B00000
  DC.L  $C0020000,$C90FDAA2,$2168C235,$20800000
  DC.L  $C0020000,$AFEDDF4D,$DD3BA9EE,$A0200000
  DC.L  $C0020000,$96CBE3F9,$990E91A8,$20E00000
*--N = -5 .. -3
  DC.L  $C0010000,$FB53D14A,$A9C2F2C2,$1F000000
  DC.L  $C0010000,$C90FDAA2,$2168C235,$20000000
  DC.L  $C0010000,$96CBE3F9,$990E91A8,$20600000
*--N = -2, -1
  DC.L  $C0000000,$C90FDAA2,$2168C235,$1F800000
  DC.L  $BFFF0000,$C90FDAA2,$2168C235,$1F000000
*--N = 0 (this entry is at PITBL+$200)
  DC.L  $00000000,$00000000,$00000000,$00000000
*--N = 1, 2
  DC.L  $3FFF0000,$C90FDAA2,$2168C235,$9F000000
  DC.L  $40000000,$C90FDAA2,$2168C235,$9F800000
*--N = 3 .. 5
  DC.L  $40010000,$96CBE3F9,$990E91A8,$A0600000
  DC.L  $40010000,$C90FDAA2,$2168C235,$A0000000
  DC.L  $40010000,$FB53D14A,$A9C2F2C2,$9F000000
*--N = 6 .. 10
  DC.L  $40020000,$96CBE3F9,$990E91A8,$A0E00000
  DC.L  $40020000,$AFEDDF4D,$DD3BA9EE,$20200000
  DC.L  $40020000,$C90FDAA2,$2168C235,$A0800000
  DC.L  $40020000,$E231D5F6,$6595DA7B,$20B00000
  DC.L  $40020000,$FB53D14A,$A9C2F2C2,$9F800000
*--N = 11
  DC.L  $40030000,$8A3AE64F,$76F80584,$21080000
*--Remaining PITBL entries (16 bytes each: extended leading term plus
*--single precision trailing term).
*--N = 12 .. 20
  DC.L  $40030000,$96CBE3F9,$990E91A8,$A1600000
  DC.L  $40030000,$A35CE1A3,$BB251DCB,$A0900000
  DC.L  $40030000,$AFEDDF4D,$DD3BA9EE,$20A00000
  DC.L  $40030000,$BC7EDCF7,$FF523611,$21680000
  DC.L  $40030000,$C90FDAA2,$2168C235,$A1000000
  DC.L  $40030000,$D5A0D84C,$437F4E58,$1FC00000
  DC.L  $40030000,$E231D5F6,$6595DA7B,$21300000
  DC.L  $40030000,$EEC2D3A0,$87AC669F,$A1380000
  DC.L  $40030000,$FB53D14A,$A9C2F2C2,$A0000000
*--N = 21 .. 32
  DC.L  $40040000,$83F2677A,$65ECBF73,$A1C40000
  DC.L  $40040000,$8A3AE64F,$76F80584,$21880000
  DC.L  $40040000,$90836524,$88034B96,$A0B00000
  DC.L  $40040000,$96CBE3F9,$990E91A8,$A1E00000
  DC.L  $40040000,$9D1462CE,$AA19D7B9,$21580000
  DC.L  $40040000,$A35CE1A3,$BB251DCB,$A1100000
  DC.L  $40040000,$A9A56078,$CC3063DD,$A1FC0000
  DC.L  $40040000,$AFEDDF4D,$DD3BA9EE,$21200000
  DC.L  $40040000,$B6365E22,$EE46F000,$A1480000
  DC.L  $40040000,$BC7EDCF7,$FF523611,$21E80000
  DC.L  $40040000,$C2C75BCD,$105D7C23,$20D00000
  DC.L  $40040000,$C90FDAA2,$2168C235,$A1800000

*--Scratch locations, all addressed off a6 by the code below.
*--FP_SCRn / L_SCRn are not defined in this file (presumably they come
*--from fpsp.h -- confirm).
*--INARG   receives a copy of FP0 at the top of each reduction LOOP pass.
*--TWOTO63 holds SIGN(INARG)*2**63 as a single precision value.
*--ENDFLAG is 1 on the last reduction pass and 0 otherwise.
*--N       receives the integer quotient when reduction finishes.
INARG   equ     FP_SCR4

TWOTO63 equ     L_SCR1
ENDFLAG equ     L_SCR2
N       equ     L_SCR3

*--External exit routines; their behaviour is not visible in this file.
        xref    t_frcinx
        xref    t_extdnrm

        xdef    stand
stand:
*--TAN(X) = X FOR DENORMALIZED X

        bra             t_extdnrm

*--stan: tangent of the extended precision value at (a0), result in FP0.
*--d1 is written to fpcr just before the final operation on every exit
*--path; it is never loaded in this file, so the caller presumably
*--supplies the user's FPCR there -- confirm against the caller.
        xdef    stan
stan:
        FMOVE.X         (a0),FP0        ...LOAD INPUT

*--Build the "compact form" of |X| in D0: the high word is the first
*--word at (A0), the low word is the word at 4(A0), and the top bit is
*--then cleared.
        MOVE.L          (A0),D0
        MOVE.W          4(A0),D0
        ANDI.L          #$7FFFFFFF,D0

        CMPI.L          #$3FD78000,D0           ...|X| >= 2**(-40)?
        BGE.B           TANOK1
        BRA.W           TANSM
TANOK1:
        CMPI.L          #$4004BC7E,D0           ...|X| < 15 PI?
        BLT.B           TANMAIN
        BRA.W           REDUCEX


TANMAIN:
*--THIS IS THE USUAL CASE, |X| <= 15 PI.
*--THE ARGUMENT REDUCTION IS DONE BY TABLE LOOK UP.
*--Body of TANMAIN: compute N = integer conversion of X*(2/PI), then
*--subtract the two-part table value of N*PI/2 from X to get R in FP0.
        FMOVE.X         FP0,FP1
        FMUL.D          TWOBYPI,FP1     ...X*2/PI

*--HIDE THE NEXT TWO INSTRUCTIONS
*--PITBL+$200 is the N = 0 entry (32 entries of 16 bytes precede it).
        lea.l           PITBL+$200,a1 ...TABLE OF N*PI/2, N = -32,...,32

*--FP1 IS NOW READY
        FMOVE.L         FP1,D0          ...CONVERT TO INTEGER

*--Scale N by 16, the size of one table entry.
        ASL.L           #4,D0
        ADDA.L          D0,a1           ...ADDRESS N*PIBY2 IN Y1, Y2

        FSUB.X          (a1)+,FP0       ...X-Y1
*--HIDE THE NEXT ONE

        FSUB.S          (a1),FP0        ...FP0 IS R = (X-Y1)-Y2

*--D0 is 16*N here, so rotating right by 5 puts bit 0 of N in bit 31.
        ROR.L           #5,D0
        ANDI.L          #$80000000,D0   ...D0 WAS ODD IFF D0 < 0

*--TANCONT: common evaluation point. Expects R in FP0 and the parity of
*--N in the sign bit of D0 (negative means N is odd).
TANCONT:

        TST.L           D0
        BLT.W           NODD

*--N even: tan(X) = U/V. FP2 accumulates the P polynomial and FP3 the Q
*--polynomial, interleaved.
        FMOVE.X         FP0,FP1
        FMUL.X          FP1,FP1         ...S = R*R

        FMOVE.D         TANQ4,FP3
        FMOVE.D         TANP3,FP2

        FMUL.X          FP1,FP3         ...SQ4
        FMUL.X          FP1,FP2         ...SP3

        FADD.D          TANQ3,FP3       ...Q3+SQ4
        FADD.X          TANP2,FP2       ...P2+SP3

        FMUL.X          FP1,FP3         ...S(Q3+SQ4)
        FMUL.X          FP1,FP2         ...S(P2+SP3)

        FADD.X          TANQ2,FP3       ...Q2+S(Q3+SQ4)
        FADD.X          TANP1,FP2       ...P1+S(P2+SP3)

        FMUL.X          FP1,FP3         ...S(Q2+S(Q3+SQ4))
        FMUL.X          FP1,FP2         ...S(P1+S(P2+SP3))

        FADD.X          TANQ1,FP3       ...Q1+S(Q2+S(Q3+SQ4))
        FMUL.X          FP0,FP2         ...RS(P1+S(P2+SP3))

        FMUL.X          FP3,FP1         ...S(Q1+S(Q2+S(Q3+SQ4)))


        FADD.X          FP2,FP0         ...R+RS(P1+S(P2+SP3))


*--#:3F800000 is 1.0 in single precision.
        FADD.S          #:3F800000,FP1  ...1+S(Q1+...)

*--FP0 = U, FP1 = V; the divide leaves U/V in FP0.
        FMOVE.L         d1,fpcr         ;restore users exceptions
        FDIV.X          FP1,FP0         ;last inst - possible exception set

        bra             t_frcinx

*--N odd: tan(X) = -cot(R) = -V/U. Same polynomials as above with the
*--roles of FP0 and FP1 swapped: S is kept in FP0 and R in FP1.
NODD:
        FMOVE.X         FP0,FP1
        FMUL.X          FP0,FP0         ...S = R*R

        FMOVE.D         TANQ4,FP3
        FMOVE.D         TANP3,FP2

        FMUL.X          FP0,FP3         ...SQ4
        FMUL.X          FP0,FP2         ...SP3

        FADD.D          TANQ3,FP3       ...Q3+SQ4
        FADD.X          TANP2,FP2       ...P2+SP3

        FMUL.X          FP0,FP3         ...S(Q3+SQ4)
        FMUL.X          FP0,FP2         ...S(P2+SP3)

        FADD.X          TANQ2,FP3       ...Q2+S(Q3+SQ4)
        FADD.X          TANP1,FP2       ...P1+S(P2+SP3)

        FMUL.X          FP0,FP3         ...S(Q2+S(Q3+SQ4))
        FMUL.X          FP0,FP2         ...S(P1+S(P2+SP3))

        FADD.X          TANQ1,FP3       ...Q1+S(Q2+S(Q3+SQ4))
        FMUL.X          FP1,FP2         ...RS(P1+S(P2+SP3))

        FMUL.X          FP3,FP0         ...S(Q1+S(Q2+S(Q3+SQ4)))


        FADD.X          FP2,FP1         ...R+RS(P1+S(P2+SP3))
        FADD.S          #:3F800000,FP0  ...1+S(Q1+...)


*--Push U and flip the top bit of its first longword (the sign), so the
*--divide below yields V/(-U).
        FMOVE.X         FP1,-(sp)
        EORI.L          #$80000000,(sp)

        FMOVE.L         d1,fpcr         ;restore users exceptions
        FDIV.X          (sp)+,FP0       ;last inst - possible exception set

        bra             t_frcinx

*--NOTE(review): no branch to TANBORS appears in the visible code; stan
*--branches directly to TANSM or REDUCEX.
TANBORS:
*--IF |X| > 15PI, WE USE THE GENERAL ARGUMENT REDUCTION.
*--IF |X| < 2**(-40), RETURN X.
*--Compare under the TANBORS label: compact |X| above $3FFF8000 goes to
*--REDUCEX, otherwise execution falls through into TANSM.
        CMPI.L          #$3FFF8000,D0
        BGT.B           REDUCEX

*--TANSM: tiny argument, tan(X) = X. X is moved through the stack so
*--that the final FMOVE executes after the fpcr has been rewritten.
TANSM:

        FMOVE.X         FP0,-(sp)
        FMOVE.L         d1,fpcr          ;restore users exceptions
        FMOVE.X         (sp)+,FP0       ;last inst - possible exception set

        bra             t_frcinx


REDUCEX:
*--WHEN REDUCEX IS USED, THE CODE WILL INEVITABLY BE SLOW.
*--THIS REDUCTION METHOD, HOWEVER, IS MUCH FASTER THAN USING
*--THE REMAINDER INSTRUCTION WHICH IS NOW IN SOFTWARE.

*--FP2-FP5 and D2 are used as work registers below and are popped again
*--at RESTORE.
        FMOVEM.X        FP2-FP5,-(A7)   ...save FP2 through FP5
        MOVE.L          D2,-(A7)
*--FP1 holds the low part r of the running remainder (R,r); start at 0.
        FMOVE.S         #:00000000,FP1

*--If compact form of abs(arg) in d0=$7ffeffff, argument is so large that
*--there is a danger of unwanted overflow in first LOOP iteration.  In this
*--case, reduce argument by one remainder step to make subsequent reduction
*--safe.
        cmpi.l  #$7ffeffff,d0           ;is argument dangerously large?
        bne.b   LOOP
        move.l  #$7ffe0000,FP_SCR2(a6)  ;yes
*                                       ;create 2**16383*PI/2
        move.l  #$c90fdaa2,FP_SCR2+4(a6)
        clr.l   FP_SCR2+8(a6)
        ftst.x  fp0                     ;test sign of argument
        move.l  #$7fdc0000,FP_SCR3(a6)  ;create low half of 2**16383*
*                                       ;PI/2 at FP_SCR3
        move.l  #$85a308d3,FP_SCR3+4(a6)
        clr.l   FP_SCR3+8(a6)
*--The two constants are built positive. For a negative argument they
*--are added as they are; for a positive argument their sign bits are
*--set first, so the adds below subtract.
        fblt.w  red_neg
        or.w    #$8000,FP_SCR2(a6)      ;positive arg
        or.w    #$8000,FP_SCR3(a6)
red_neg:
        fadd.x  FP_SCR2(a6),fp0         ;high part of reduction is exact
        fmove.x  fp0,fp1                ;save high result in fp1
        fadd.x  FP_SCR3(a6),fp0         ;low part of reduction
        fsub.x  fp0,fp1                 ;determine low component of result
        fadd.x  FP_SCR3(a6),fp1         ;fp0/fp1 are reduced argument.

*--ON ENTRY, FP0 IS X, ON RETURN, FP0 IS X REM PI/2, |X| <= PI/4.
*--integer quotient will be stored in N
*--Intermediate remainder is 66-bit long; (R,r) in (FP0,FP1)

LOOP:
        FMOVE.X         FP0,INARG(a6)   ...+-2**K * F, 1 <= F < 2
        MOVE.W          INARG(a6),D0
*--A1 keeps the sign/exponent word of INARG; WORK reads the sign from it.
        MOVE.L          D0,A1           ...save a copy of D0
        ANDI.L          #$00007FFF,D0
        SUBI.L          #$00003FFF,D0   ...D0 IS K
*--K <= 28: this is the final pass (L = 0, ENDFLAG = 1).
        CMPI.L          #28,D0
        BLE.B           LASTLOOP
CONTLOOP:
        SUBI.L          #27,D0   ...D0 IS L := K-27
        CLR.L           ENDFLAG(a6)
        BRA.B           WORK
LASTLOOP:
        CLR.L           D0              ...D0 IS L := 0
        MOVE.L          #1,ENDFLAG(a6)

WORK:
*--FIND THE REMAINDER OF (R,r) W.R.T.   2**L * (PI/2). L IS SO CHOSEN
*--THAT INT( X * (2/PI) / 2**(L) ) < 2**29.
*--CREATE 2**(-L) * (2/PI), SIGN(INARG)*2**(63),
*--2**L * (PIby2_1), 2**L * (PIby2_2)

*--On entry: D0 = L, A1 = sign/exponent word of INARG, (FP0,FP1) = (R,r).
        MOVE.L          #$00003FFE,D2   ...BIASED EXPO OF 2/PI
        SUB.L           D0,D2           ...BIASED EXPO OF 2**(-L)*(2/PI)

        MOVE.L          #$A2F9836E,FP_SCR1+4(a6)
        MOVE.L          #$4E44152A,FP_SCR1+8(a6)
        MOVE.W          D2,FP_SCR1(a6)  ...FP_SCR1 is 2**(-L)*(2/PI)

        FMOVE.X         FP0,FP2
        FMUL.X          FP_SCR1(a6),FP2
*--WE MUST NOW FIND INT(FP2). SINCE WE NEED THIS VALUE IN
*--FLOATING POINT FORMAT, THE TWO FMOVE'S       FMOVE.L FP <--> N
*--WILL BE TOO INEFFICIENT. THE WAY AROUND IT IS THAT
*--(SIGN(INARG)*2**63   +       FP2) - SIGN(INARG)*2**63 WILL GIVE
*--US THE DESIRED VALUE IN FLOATING POINT.

*--HIDE SIX CYCLES OF INSTRUCTION
*--SWAP moves the saved sign/exponent word into the high half of D2, so
*--bit 31 becomes the sign of INARG; $5F000000 is 2**63 in single.
        MOVE.L          A1,D2
        SWAP            D2
        ANDI.L          #$80000000,D2
        ORI.L           #$5F000000,D2   ...D2 IS SIGN(INARG)*2**63 IN SGL
        MOVE.L          D2,TWOTO63(a6)

        MOVE.L          D0,D2
        ADDI.L          #$00003FFF,D2   ...BIASED EXPO OF 2**L * (PI/2)

*--FP2 IS READY
        FADD.S          TWOTO63(a6),FP2 ...THE FRACTIONAL PART OF FP2 IS ROUNDED

*--HIDE 4 CYCLES OF INSTRUCTION; creating 2**(L)*Piby2_1  and  2**(L)*Piby2_2
        MOVE.W          D2,FP_SCR2(a6)
        CLR.W           FP_SCR2+2(a6)
        MOVE.L          #$C90FDAA2,FP_SCR2+4(a6)
        CLR.L           FP_SCR2+8(a6)           ...FP_SCR2 is  2**(L) * Piby2_1

*--FP2 IS READY
        FSUB.S          TWOTO63(a6),FP2         ...FP2 is N

*--D0 still holds L; adding $3FDD gives the biased exponent used for
*--2**L * Piby2_2.
        ADDI.L          #$00003FDD,D0
        MOVE.W          D0,FP_SCR3(a6)
        CLR.W           FP_SCR3+2(a6)
        MOVE.L          #$85A308D3,FP_SCR3+4(a6)
        CLR.L           FP_SCR3+8(a6)           ...FP_SCR3 is 2**(L) * Piby2_2

*--D0 is reloaded with ENDFLAG; L is not needed beyond this point.
        MOVE.L          ENDFLAG(a6),D0

*--We are now ready to perform (R+r) - N*P1 - N*P2, P1 = 2**(L) * Piby2_1 and
*--P2 = 2**(L) * Piby2_2
        FMOVE.X         FP2,FP4
        FMul.X          FP_SCR2(a6),FP4         ...W = N*P1
        FMove.X         FP2,FP5
        FMul.X          FP_SCR3(a6),FP5         ...w = N*P2
        FMove.X         FP4,FP3
*--we want P+p = W+w  but  |p| <= half ulp of P
*--Then, we need to compute  A := R-P   and  a := r-p
        FAdd.X          FP5,FP3                 ...FP3 is P
        FSub.X          FP3,FP4                 ...W-P

        FSub.X          FP3,FP0                 ...FP0 is A := R - P
        FAdd.X          FP5,FP4                 ...FP4 is p = (W-P)+w

        FMove.X         FP0,FP3                 ...FP3 A
        FSub.X          FP4,FP1                 ...FP1 is a := r - p

*--Now we need to normalize (A,a) to  "new (R,r)" where R+r = A+a but
*--|r| <= half ulp of R.
*--Tail of one reduction pass. Here FP0 = A, FP1 = a, FP3 = copy of A,
*--FP2 = N and D0 = ENDFLAG.
        FAdd.X          FP1,FP0                 ...FP0 is R := A+a
*--No need to calculate r if this is the last loop
        TST.L           D0
        BGT.W           RESTORE

*--Need to calculate r
        FSub.X          FP0,FP3                 ...A-R
        FAdd.X          FP3,FP1                 ...FP1 is r := (A-R)+a
        BRA.W           LOOP

*--RESTORE: reduction finished. Store the quotient as an integer in N,
*--then pop D2 and FP2-FP5 in the reverse order of the pushes at REDUCEX.
RESTORE:
        FMOVE.L         FP2,N(a6)
        MOVE.L          (A7)+,D2
        FMOVEM.X        (A7)+,FP2-FP5


*--Rotate bit 0 of N into bit 31 so that D0 is negative exactly when N
*--is odd, which is the form TANCONT tests.
        MOVE.L          N(a6),D0
        ROR.L           #1,D0


        BRA.W           TANCONT

        end