jidctfst.cpp - DOOM-3-BFG - DOOM 3 BFG Edition

jidctfst.cpp (14346B)
      1 /*
      2  * jidctfst.c
      3  *
      4  * Copyright (C) 1994-1995, Thomas G. Lane.
      5  * This file is part of the Independent JPEG Group's software.
      6  * For conditions of distribution and use, see the accompanying README file.
      7  *
      8  * This file contains a fast, not so accurate integer implementation of the
      9  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
     10  * must also perform dequantization of the input coefficients.
     11  *
     12  * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
     13  * on each row (or vice versa, but it's more convenient to emit a row at
     14  * a time).  Direct algorithms are also available, but they are much more
     15  * complex and seem not to be any faster when reduced to code.
     16  *
     17  * This implementation is based on Arai, Agui, and Nakajima's algorithm for
     18  * scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in
     19  * Japanese, but the algorithm is described in the Pennebaker & Mitchell
     20  * JPEG textbook (see REFERENCES section in file README).  The following code
     21  * is based directly on figure 4-8 in P&M.
     22  * While an 8-point DCT cannot be done in less than 11 multiplies, it is
     23  * possible to arrange the computation so that many of the multiplies are
     24  * simple scalings of the final outputs.  These multiplies can then be
     25  * folded into the multiplications or divisions by the JPEG quantization
     26  * table entries.  The AA&N method leaves only 5 multiplies and 29 adds
     27  * to be done in the DCT itself.
     28  * The primary disadvantage of this method is that with fixed-point math,
     29  * accuracy is lost due to imprecise representation of the scaled
     30  * quantization values.  The smaller the quantization table entry, the less
     31  * precise the scaled value, so this implementation does worse with high-
     32  * quality-setting files than with low-quality ones.
     33  */
     34 
     35 #define JPEG_INTERNALS
     36 #include "jinclude.h"
     37 #include "jpeglib.h"
     38 #include "jdct.h"        /* Private declarations for DCT subsystem */
     39 
     40 #ifdef DCT_IFAST_SUPPORTED
     41 
     42 
     43 /*
     44  * This module is specialized to the case DCTSIZE = 8.
     45  */
     46 
     47 #if DCTSIZE != 8
     48 Sorry, this code only copes with 8 x8 DCTs.  /* deliberate syntax err */
     49     #endif
     50 
     51 
     52 /* Scaling decisions are generally the same as in the LL&M algorithm;
     53  * see jidctint.c for more details.  However, we choose to descale
     54  * (right shift) multiplication products as soon as they are formed,
     55  * rather than carrying additional fractional bits into subsequent additions.
     56  * This compromises accuracy slightly, but it lets us save a few shifts.
     57  * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples)
     58  * everywhere except in the multiplications proper; this saves a good deal
     59  * of work on 16-bit-int machines.
     60  *
     61  * The dequantized coefficients are not integers because the AA&N scaling
     62  * factors have been incorporated.  We represent them scaled up by PASS1_BITS,
     63  * so that the first and second IDCT rounds have the same input scaling.
     64  * For 8-bit JSAMPLEs, we choose IFAST_SCALE_BITS = PASS1_BITS so as to
     65  * avoid a descaling shift; this compromises accuracy rather drastically
     66  * for small quantization table entries, but it saves a lot of shifts.
     67  * For 12-bit JSAMPLEs, there's no hope of using 16x16 multiplies anyway,
     68  * so we use a much larger scaling factor to preserve accuracy.
     69  *
     70  * A final compromise is to represent the multiplicative constants to only
     71  * 8 fractional bits, rather than 13.  This saves some shifting work on some
     72  * machines, and may also reduce the cost of multiplication (since there
     73  * are fewer one-bits in the constants).
     74  */
     75 
     76 #if BITS_IN_JSAMPLE == 8
     77 #define CONST_BITS  8
     78 #define PASS1_BITS  2
     79 #else
     80 #define CONST_BITS  8
     81 #define PASS1_BITS  1       /* lose a little precision to avoid overflow */
     82 #endif
     83 
     84 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
     85  * causing a lot of useless floating-point operations at run time.
     86  * To get around this we use the following pre-calculated constants.
     87  * If you change CONST_BITS you may want to add appropriate values.
     88  * (With a reasonable C compiler, you can just rely on the FIX() macro...)
     89  */
     90 
     91 #if CONST_BITS == 8
     92 #define FIX_1_082392200  ( (INT32)  277 )     /* FIX(1.082392200) */
     93 #define FIX_1_414213562  ( (INT32)  362 )     /* FIX(1.414213562) */
     94 #define FIX_1_847759065  ( (INT32)  473 )     /* FIX(1.847759065) */
     95 #define FIX_2_613125930  ( (INT32)  669 )     /* FIX(2.613125930) */
     96 #else
     97 #define FIX_1_082392200  FIX( 1.082392200 )
     98 #define FIX_1_414213562  FIX( 1.414213562 )
     99 #define FIX_1_847759065  FIX( 1.847759065 )
    100 #define FIX_2_613125930  FIX( 2.613125930 )
    101 #endif
    102 
    103 
    104 /* We can gain a little more speed, with a further compromise in accuracy,
    105  * by omitting the addition in a descaling shift.  This yields an incorrectly
    106  * rounded result half the time...
    107  */
    108 
    109 #ifndef USE_ACCURATE_ROUNDING
    110 #undef DESCALE
    111 #define DESCALE( x, n )  RIGHT_SHIFT( x, n )
    112 #endif
    113 
    114 
    115 /* Multiply a DCTELEM variable by an INT32 constant, and immediately
    116  * descale to yield a DCTELEM result.
    117  */
    118 
    119 #define MULTIPLY( var, const )  ( (DCTELEM) DESCALE( ( var ) * ( const ), CONST_BITS ) )
    120 
    121 
    122 /* Dequantize a coefficient by multiplying it by the multiplier-table
    123  * entry; produce a DCTELEM result.  For 8-bit data a 16x16->16
    124  * multiplication will do.  For 12-bit data, the multiplier table is
    125  * declared INT32, so a 32-bit multiply will be used.
    126  */
    127 
    128 #if BITS_IN_JSAMPLE == 8
    129 #define DEQUANTIZE( coef, quantval )  ( ( (IFAST_MULT_TYPE) ( coef ) ) * ( quantval ) )
    130 #else
    131 #define DEQUANTIZE( coef, quantval )  \
    132     DESCALE( ( coef ) * ( quantval ), IFAST_SCALE_BITS - PASS1_BITS )
    133 #endif
    134 
    135 
    136 /* Like DESCALE, but applies to a DCTELEM and produces an int.
    137  * We assume that int right shift is unsigned if INT32 right shift is.
    138  */
    139 
    140 #ifdef RIGHT_SHIFT_IS_UNSIGNED
    141 #define ISHIFT_TEMPS    DCTELEM ishift_temp;
    142 #if BITS_IN_JSAMPLE == 8
    143 #define DCTELEMBITS  16     /* DCTELEM may be 16 or 32 bits */
    144 #else
    145 #define DCTELEMBITS  32     /* DCTELEM must be 32 bits */
    146 #endif
    147 #define IRIGHT_SHIFT( x, shft )  \
    148     ( ( ishift_temp = ( x ) ) < 0 ? \
    149                       ( ishift_temp >> ( shft ) ) | ( ( ~( (DCTELEM) 0 ) ) << ( DCTELEMBITS - ( shft ) ) ) : \
    150                       ( ishift_temp >> ( shft ) ) )
    151 #else
    152 #define ISHIFT_TEMPS
    153 #define IRIGHT_SHIFT( x, shft )    ( ( x ) >> ( shft ) )
    154 #endif
    155 
    156 #ifdef USE_ACCURATE_ROUNDING
    157 #define IDESCALE( x, n )  ( (int) IRIGHT_SHIFT( ( x ) + ( 1 << ( ( n ) - 1 ) ), n ) )
    158 #else
    159 #define IDESCALE( x, n )  ( (int) IRIGHT_SHIFT( x, n ) )
    160 #endif
    161 
    162 
    163 /*
    164  * Perform dequantization and inverse DCT on one block of coefficients.
    165  */
    166 
    167 GLOBAL void
    168 jpeg_idct_ifast( j_decompress_ptr cinfo, jpeg_component_info * compptr,
    169                  JCOEFPTR coef_block,
    170                  JSAMPARRAY output_buf, JDIMENSION output_col ) {
    171     DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    172     DCTELEM tmp10, tmp11, tmp12, tmp13;
    173     DCTELEM z5, z10, z11, z12, z13;
    174     JCOEFPTR inptr;
    175     IFAST_MULT_TYPE * quantptr;
    176     int * wsptr;
    177     JSAMPROW outptr;
    178     JSAMPLE * range_limit = IDCT_range_limit( cinfo );
    179     int ctr;
    180     int workspace[DCTSIZE2];/* buffers data between passes */
    181     SHIFT_TEMPS         /* for DESCALE */
    182     ISHIFT_TEMPS        /* for IDESCALE */
    183 
    184     /* Pass 1: process columns from input, store into work array. */
    185 
    186     inptr = coef_block;
    187     quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
    188     wsptr = workspace;
    189     for ( ctr = DCTSIZE; ctr > 0; ctr-- ) {
    190         /* Due to quantization, we will usually find that many of the input
    191          * coefficients are zero, especially the AC terms.  We can exploit this
    192          * by short-circuiting the IDCT calculation for any column in which all
    193          * the AC terms are zero.  In that case each output is equal to the
    194          * DC coefficient (with scale factor as needed).
    195          * With typical images and quantization tables, half or more of the
    196          * column DCT calculations can be simplified this way.
    197          */
    198 
    199         if ( ( inptr[DCTSIZE * 1] | inptr[DCTSIZE * 2] | inptr[DCTSIZE * 3] |
    200                inptr[DCTSIZE * 4] | inptr[DCTSIZE * 5] | inptr[DCTSIZE * 6] |
    201                inptr[DCTSIZE * 7] ) == 0 ) {
    202             /* AC terms all zero */
    203             int dcval = (int) DEQUANTIZE( inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0] );
    204 
    205             wsptr[DCTSIZE * 0] = dcval;
    206             wsptr[DCTSIZE * 1] = dcval;
    207             wsptr[DCTSIZE * 2] = dcval;
    208             wsptr[DCTSIZE * 3] = dcval;
    209             wsptr[DCTSIZE * 4] = dcval;
    210             wsptr[DCTSIZE * 5] = dcval;
    211             wsptr[DCTSIZE * 6] = dcval;
    212             wsptr[DCTSIZE * 7] = dcval;
    213 
    214             inptr++;    /* advance pointers to next column */
    215             quantptr++;
    216             wsptr++;
    217             continue;
    218         }
    219 
    220         /* Even part */
    221 
    222         tmp0 = DEQUANTIZE( inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0] );
    223         tmp1 = DEQUANTIZE( inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2] );
    224         tmp2 = DEQUANTIZE( inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4] );
    225         tmp3 = DEQUANTIZE( inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6] );
    226 
    227         tmp10 = tmp0 + tmp2;/* phase 3 */
    228         tmp11 = tmp0 - tmp2;
    229 
    230         tmp13 = tmp1 + tmp3;/* phases 5-3 */
    231         tmp12 = MULTIPLY( tmp1 - tmp3, FIX_1_414213562 ) - tmp13;/* 2*c4 */
    232 
    233         tmp0 = tmp10 + tmp13;/* phase 2 */
    234         tmp3 = tmp10 - tmp13;
    235         tmp1 = tmp11 + tmp12;
    236         tmp2 = tmp11 - tmp12;
    237 
    238         /* Odd part */
    239 
    240         tmp4 = DEQUANTIZE( inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1] );
    241         tmp5 = DEQUANTIZE( inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3] );
    242         tmp6 = DEQUANTIZE( inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5] );
    243         tmp7 = DEQUANTIZE( inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7] );
    244 
    245         z13 = tmp6 + tmp5;  /* phase 6 */
    246         z10 = tmp6 - tmp5;
    247         z11 = tmp4 + tmp7;
    248         z12 = tmp4 - tmp7;
    249 
    250         tmp7 = z11 + z13;   /* phase 5 */
    251         tmp11 = MULTIPLY( z11 - z13, FIX_1_414213562 );/* 2*c4 */
    252 
    253         z5 = MULTIPLY( z10 + z12, FIX_1_847759065 );/* 2*c2 */
    254         tmp10 = MULTIPLY( z12, FIX_1_082392200 ) - z5;/* 2*(c2-c6) */
    255         tmp12 = MULTIPLY( z10, -FIX_2_613125930 ) + z5;/* -2*(c2+c6) */
    256 
    257         tmp6 = tmp12 - tmp7;/* phase 2 */
    258         tmp5 = tmp11 - tmp6;
    259         tmp4 = tmp10 + tmp5;
    260 
    261         wsptr[DCTSIZE * 0] = (int) ( tmp0 + tmp7 );
    262         wsptr[DCTSIZE * 7] = (int) ( tmp0 - tmp7 );
    263         wsptr[DCTSIZE * 1] = (int) ( tmp1 + tmp6 );
    264         wsptr[DCTSIZE * 6] = (int) ( tmp1 - tmp6 );
    265         wsptr[DCTSIZE * 2] = (int) ( tmp2 + tmp5 );
    266         wsptr[DCTSIZE * 5] = (int) ( tmp2 - tmp5 );
    267         wsptr[DCTSIZE * 4] = (int) ( tmp3 + tmp4 );
    268         wsptr[DCTSIZE * 3] = (int) ( tmp3 - tmp4 );
    269 
    270         inptr++;        /* advance pointers to next column */
    271         quantptr++;
    272         wsptr++;
    273     }
    274 
    275     /* Pass 2: process rows from work array, store into output array. */
    276     /* Note that we must descale the results by a factor of 8 == 2**3, */
    277     /* and also undo the PASS1_BITS scaling. */
    278 
    279     wsptr = workspace;
    280     for ( ctr = 0; ctr < DCTSIZE; ctr++ ) {
    281         outptr = output_buf[ctr] + output_col;
    282         /* Rows of zeroes can be exploited in the same way as we did with columns.
    283          * However, the column calculation has created many nonzero AC terms, so
    284          * the simplification applies less often (typically 5% to 10% of the time).
    285          * On machines with very fast multiplication, it's possible that the
    286          * test takes more time than it's worth.  In that case this section
    287          * may be commented out.
    288          */
    289 
    290 #ifndef NO_ZERO_ROW_TEST
    291         if ( ( wsptr[1] | wsptr[2] | wsptr[3] | wsptr[4] | wsptr[5] | wsptr[6] |
    292                wsptr[7] ) == 0 ) {
    293             /* AC terms all zero */
    294             JSAMPLE dcval = range_limit[IDESCALE( wsptr[0], PASS1_BITS + 3 )
    295                                         & RANGE_MASK];
    296 
    297             outptr[0] = dcval;
    298             outptr[1] = dcval;
    299             outptr[2] = dcval;
    300             outptr[3] = dcval;
    301             outptr[4] = dcval;
    302             outptr[5] = dcval;
    303             outptr[6] = dcval;
    304             outptr[7] = dcval;
    305 
    306             wsptr += DCTSIZE;/* advance pointer to next row */
    307             continue;
    308         }
    309 #endif
    310 
    311         /* Even part */
    312 
    313         tmp10 = ( (DCTELEM) wsptr[0] + (DCTELEM) wsptr[4] );
    314         tmp11 = ( (DCTELEM) wsptr[0] - (DCTELEM) wsptr[4] );
    315 
    316         tmp13 = ( (DCTELEM) wsptr[2] + (DCTELEM) wsptr[6] );
    317         tmp12 = MULTIPLY( (DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562 )
    318                 - tmp13;
    319 
    320         tmp0 = tmp10 + tmp13;
    321         tmp3 = tmp10 - tmp13;
    322         tmp1 = tmp11 + tmp12;
    323         tmp2 = tmp11 - tmp12;
    324 
    325         /* Odd part */
    326 
    327         z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
    328         z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
    329         z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
    330         z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
    331 
    332         tmp7 = z11 + z13;   /* phase 5 */
    333         tmp11 = MULTIPLY( z11 - z13, FIX_1_414213562 );/* 2*c4 */
    334 
    335         z5 = MULTIPLY( z10 + z12, FIX_1_847759065 );/* 2*c2 */
    336         tmp10 = MULTIPLY( z12, FIX_1_082392200 ) - z5;/* 2*(c2-c6) */
    337         tmp12 = MULTIPLY( z10, -FIX_2_613125930 ) + z5;/* -2*(c2+c6) */
    338 
    339         tmp6 = tmp12 - tmp7;/* phase 2 */
    340         tmp5 = tmp11 - tmp6;
    341         tmp4 = tmp10 + tmp5;
    342 
    343         /* Final output stage: scale down by a factor of 8 and range-limit */
    344 
    345         outptr[0] = range_limit[IDESCALE( tmp0 + tmp7, PASS1_BITS + 3 )
    346                                 & RANGE_MASK];
    347         outptr[7] = range_limit[IDESCALE( tmp0 - tmp7, PASS1_BITS + 3 )
    348                                 & RANGE_MASK];
    349         outptr[1] = range_limit[IDESCALE( tmp1 + tmp6, PASS1_BITS + 3 )
    350                                 & RANGE_MASK];
    351         outptr[6] = range_limit[IDESCALE( tmp1 - tmp6, PASS1_BITS + 3 )
    352                                 & RANGE_MASK];
    353         outptr[2] = range_limit[IDESCALE( tmp2 + tmp5, PASS1_BITS + 3 )
    354                                 & RANGE_MASK];
    355         outptr[5] = range_limit[IDESCALE( tmp2 - tmp5, PASS1_BITS + 3 )
    356                                 & RANGE_MASK];
    357         outptr[4] = range_limit[IDESCALE( tmp3 + tmp4, PASS1_BITS + 3 )
    358                                 & RANGE_MASK];
    359         outptr[3] = range_limit[IDESCALE( tmp3 - tmp4, PASS1_BITS + 3 )
    360                                 & RANGE_MASK];
    361 
    362         wsptr += DCTSIZE;   /* advance pointer to next row */
    363     }
    364 }
    365 
    366 #endif /* DCT_IFAST_SUPPORTED */