jidctflt.cpp - DOOM-3-BFG - DOOM 3 BFG Edition

jidctflt.cpp (9308B)
      1 /*
      2  * jidctflt.c
      3  *
      4  * Copyright (C) 1994, Thomas G. Lane.
      5  * This file is part of the Independent JPEG Group's software.
      6  * For conditions of distribution and use, see the accompanying README file.
      7  *
      8  * This file contains a floating-point implementation of the
      9  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
     10  * must also perform dequantization of the input coefficients.
     11  *
     12  * This implementation should be more accurate than either of the integer
     13  * IDCT implementations.  However, it may not give the same results on all
     14  * machines because of differences in roundoff behavior.  Speed will depend
     15  * on the hardware's floating point capacity.
     16  *
     17  * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
     18  * on each row (or vice versa, but it's more convenient to emit a row at
     19  * a time).  Direct algorithms are also available, but they are much more
     20  * complex and seem not to be any faster when reduced to code.
     21  *
     22  * This implementation is based on Arai, Agui, and Nakajima's algorithm for
     23  * scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in
     24  * Japanese, but the algorithm is described in the Pennebaker & Mitchell
     25  * JPEG textbook (see REFERENCES section in file README).  The following code
     26  * is based directly on figure 4-8 in P&M.
     27  * While an 8-point DCT cannot be done in less than 11 multiplies, it is
     28  * possible to arrange the computation so that many of the multiplies are
     29  * simple scalings of the final outputs.  These multiplies can then be
     30  * folded into the multiplications or divisions by the JPEG quantization
     31  * table entries.  The AA&N method leaves only 5 multiplies and 29 adds
     32  * to be done in the DCT itself.
     33  * The primary disadvantage of this method is that with a fixed-point
     34  * implementation, accuracy is lost due to imprecise representation of the
     35  * scaled quantization values.  However, that problem does not arise if
     36  * we use floating point arithmetic.
     37  */
     38 
     39 #define JPEG_INTERNALS
     40 #include "jinclude.h"
     41 #include "jpeglib.h"
     42 #include "jdct.h"        /* Private declarations for DCT subsystem */
     43 
     44 #ifdef DCT_FLOAT_SUPPORTED
     45 
     46 
     47 /*
     48  * This module is specialized to the case DCTSIZE = 8.
     49  */
     50 
     51 #if DCTSIZE != 8
     52 Sorry, this code only copes with 8 x8 DCTs.  /* deliberate syntax err */
     53     #endif
     54 
     55 
     56 /* Dequantize a coefficient by multiplying it by the multiplier-table
     57  * entry; produce a float result.
     58  */
     59 
     60 #define DEQUANTIZE( coef, quantval )  ( ( (FAST_FLOAT) ( coef ) ) * ( quantval ) )
     61 
     62 
     63 /*
     64  * Perform dequantization and inverse DCT on one block of coefficients.
     65  */
     66 
     67 GLOBAL void
     68 jpeg_idct_float( j_decompress_ptr cinfo, jpeg_component_info * compptr,
     69                  JCOEFPTR coef_block,
     70                  JSAMPARRAY output_buf, JDIMENSION output_col ) {
     71     FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
     72     FAST_FLOAT tmp10, tmp11, tmp12, tmp13;
     73     FAST_FLOAT z5, z10, z11, z12, z13;
     74     JCOEFPTR inptr;
     75     FLOAT_MULT_TYPE * quantptr;
     76     FAST_FLOAT * wsptr;
     77     JSAMPROW outptr;
     78     JSAMPLE * range_limit = IDCT_range_limit( cinfo );
     79     int ctr;
     80     FAST_FLOAT workspace[DCTSIZE2];/* buffers data between passes */
     81     SHIFT_TEMPS
     82 
     83     /* Pass 1: process columns from input, store into work array. */
     84 
     85     inptr = coef_block;
     86     quantptr = (FLOAT_MULT_TYPE *) compptr->dct_table;
     87     wsptr = workspace;
     88     for ( ctr = DCTSIZE; ctr > 0; ctr-- ) {
     89         /* Due to quantization, we will usually find that many of the input
     90          * coefficients are zero, especially the AC terms.  We can exploit this
     91          * by short-circuiting the IDCT calculation for any column in which all
     92          * the AC terms are zero.  In that case each output is equal to the
     93          * DC coefficient (with scale factor as needed).
     94          * With typical images and quantization tables, half or more of the
     95          * column DCT calculations can be simplified this way.
     96          */
     97 
     98         if ( ( inptr[DCTSIZE * 1] | inptr[DCTSIZE * 2] | inptr[DCTSIZE * 3] |
     99                inptr[DCTSIZE * 4] | inptr[DCTSIZE * 5] | inptr[DCTSIZE * 6] |
    100                inptr[DCTSIZE * 7] ) == 0 ) {
    101             /* AC terms all zero */
    102             FAST_FLOAT dcval = DEQUANTIZE( inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0] );
    103 
    104             wsptr[DCTSIZE * 0] = dcval;
    105             wsptr[DCTSIZE * 1] = dcval;
    106             wsptr[DCTSIZE * 2] = dcval;
    107             wsptr[DCTSIZE * 3] = dcval;
    108             wsptr[DCTSIZE * 4] = dcval;
    109             wsptr[DCTSIZE * 5] = dcval;
    110             wsptr[DCTSIZE * 6] = dcval;
    111             wsptr[DCTSIZE * 7] = dcval;
    112 
    113             inptr++;    /* advance pointers to next column */
    114             quantptr++;
    115             wsptr++;
    116             continue;
    117         }
    118 
    119         /* Even part */
    120 
    121         tmp0 = DEQUANTIZE( inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0] );
    122         tmp1 = DEQUANTIZE( inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2] );
    123         tmp2 = DEQUANTIZE( inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4] );
    124         tmp3 = DEQUANTIZE( inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6] );
    125 
    126         tmp10 = tmp0 + tmp2;/* phase 3 */
    127         tmp11 = tmp0 - tmp2;
    128 
    129         tmp13 = tmp1 + tmp3;/* phases 5-3 */
    130         tmp12 = ( tmp1 - tmp3 ) * ( (FAST_FLOAT) 1.414213562 ) - tmp13;/* 2*c4 */
    131 
    132         tmp0 = tmp10 + tmp13;/* phase 2 */
    133         tmp3 = tmp10 - tmp13;
    134         tmp1 = tmp11 + tmp12;
    135         tmp2 = tmp11 - tmp12;
    136 
    137         /* Odd part */
    138 
    139         tmp4 = DEQUANTIZE( inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1] );
    140         tmp5 = DEQUANTIZE( inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3] );
    141         tmp6 = DEQUANTIZE( inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5] );
    142         tmp7 = DEQUANTIZE( inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7] );
    143 
    144         z13 = tmp6 + tmp5;  /* phase 6 */
    145         z10 = tmp6 - tmp5;
    146         z11 = tmp4 + tmp7;
    147         z12 = tmp4 - tmp7;
    148 
    149         tmp7 = z11 + z13;   /* phase 5 */
    150         tmp11 = ( z11 - z13 ) * ( (FAST_FLOAT) 1.414213562 );/* 2*c4 */
    151 
    152         z5 = ( z10 + z12 ) * ( (FAST_FLOAT) 1.847759065 );/* 2*c2 */
    153         tmp10 = ( (FAST_FLOAT) 1.082392200 ) * z12 - z5;/* 2*(c2-c6) */
    154         tmp12 = ( (FAST_FLOAT) -2.613125930 ) * z10 + z5;/* -2*(c2+c6) */
    155 
    156         tmp6 = tmp12 - tmp7;/* phase 2 */
    157         tmp5 = tmp11 - tmp6;
    158         tmp4 = tmp10 + tmp5;
    159 
    160         wsptr[DCTSIZE * 0] = tmp0 + tmp7;
    161         wsptr[DCTSIZE * 7] = tmp0 - tmp7;
    162         wsptr[DCTSIZE * 1] = tmp1 + tmp6;
    163         wsptr[DCTSIZE * 6] = tmp1 - tmp6;
    164         wsptr[DCTSIZE * 2] = tmp2 + tmp5;
    165         wsptr[DCTSIZE * 5] = tmp2 - tmp5;
    166         wsptr[DCTSIZE * 4] = tmp3 + tmp4;
    167         wsptr[DCTSIZE * 3] = tmp3 - tmp4;
    168 
    169         inptr++;        /* advance pointers to next column */
    170         quantptr++;
    171         wsptr++;
    172     }
    173 
    174     /* Pass 2: process rows from work array, store into output array. */
    175     /* Note that we must descale the results by a factor of 8 == 2**3. */
    176 
    177     wsptr = workspace;
    178     for ( ctr = 0; ctr < DCTSIZE; ctr++ ) {
    179         outptr = output_buf[ctr] + output_col;
    180         /* Rows of zeroes can be exploited in the same way as we did with columns.
    181          * However, the column calculation has created many nonzero AC terms, so
    182          * the simplification applies less often (typically 5% to 10% of the time).
    183          * And testing floats for zero is relatively expensive, so we don't bother.
    184          */
    185 
    186         /* Even part */
    187 
    188         tmp10 = wsptr[0] + wsptr[4];
    189         tmp11 = wsptr[0] - wsptr[4];
    190 
    191         tmp13 = wsptr[2] + wsptr[6];
    192         tmp12 = ( wsptr[2] - wsptr[6] ) * ( (FAST_FLOAT) 1.414213562 ) - tmp13;
    193 
    194         tmp0 = tmp10 + tmp13;
    195         tmp3 = tmp10 - tmp13;
    196         tmp1 = tmp11 + tmp12;
    197         tmp2 = tmp11 - tmp12;
    198 
    199         /* Odd part */
    200 
    201         z13 = wsptr[5] + wsptr[3];
    202         z10 = wsptr[5] - wsptr[3];
    203         z11 = wsptr[1] + wsptr[7];
    204         z12 = wsptr[1] - wsptr[7];
    205 
    206         tmp7 = z11 + z13;
    207         tmp11 = ( z11 - z13 ) * ( (FAST_FLOAT) 1.414213562 );
    208 
    209         z5 = ( z10 + z12 ) * ( (FAST_FLOAT) 1.847759065 );/* 2*c2 */
    210         tmp10 = ( (FAST_FLOAT) 1.082392200 ) * z12 - z5;/* 2*(c2-c6) */
    211         tmp12 = ( (FAST_FLOAT) -2.613125930 ) * z10 + z5;/* -2*(c2+c6) */
    212 
    213         tmp6 = tmp12 - tmp7;
    214         tmp5 = tmp11 - tmp6;
    215         tmp4 = tmp10 + tmp5;
    216 
    217         /* Final output stage: scale down by a factor of 8 and range-limit */
    218 
    219         outptr[0] = range_limit[(int) DESCALE( (INT32) ( tmp0 + tmp7 ), 3 )
    220                                 & RANGE_MASK];
    221         outptr[7] = range_limit[(int) DESCALE( (INT32) ( tmp0 - tmp7 ), 3 )
    222                                 & RANGE_MASK];
    223         outptr[1] = range_limit[(int) DESCALE( (INT32) ( tmp1 + tmp6 ), 3 )
    224                                 & RANGE_MASK];
    225         outptr[6] = range_limit[(int) DESCALE( (INT32) ( tmp1 - tmp6 ), 3 )
    226                                 & RANGE_MASK];
    227         outptr[2] = range_limit[(int) DESCALE( (INT32) ( tmp2 + tmp5 ), 3 )
    228                                 & RANGE_MASK];
    229         outptr[5] = range_limit[(int) DESCALE( (INT32) ( tmp2 - tmp5 ), 3 )
    230                                 & RANGE_MASK];
    231         outptr[4] = range_limit[(int) DESCALE( (INT32) ( tmp3 + tmp4 ), 3 )
    232                                 & RANGE_MASK];
    233         outptr[3] = range_limit[(int) DESCALE( (INT32) ( tmp3 - tmp4 ), 3 )
    234                                 & RANGE_MASK];
    235 
    236         wsptr += DCTSIZE;   /* advance pointer to next row */
    237     }
    238 }
    239 
    240 #endif /* DCT_FLOAT_SUPPORTED */