ReactOS  0.4.14-dev-342-gdc047f9
jfdctint.c
Go to the documentation of this file.
1 /*
2  * jfdctint.c
3  *
4  * Copyright (C) 1991-1996, Thomas G. Lane.
5  * Modification developed 2003-2015 by Guido Vollbeding.
6  * This file is part of the Independent JPEG Group's software.
7  * For conditions of distribution and use, see the accompanying README file.
8  *
9  * This file contains a slow-but-accurate integer implementation of the
10  * forward DCT (Discrete Cosine Transform).
11  *
12  * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
13  * on each column. Direct algorithms are also available, but they are
14  * much more complex and seem not to be any faster when reduced to code.
15  *
16  * This implementation is based on an algorithm described in
17  * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
18  * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
19  * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
20  * The primary algorithm described there uses 11 multiplies and 29 adds.
21  * We use their alternate method with 12 multiplies and 32 adds.
22  * The advantage of this method is that no data path contains more than one
23  * multiplication; this allows a very simple and accurate implementation in
24  * scaled fixed-point arithmetic, with a minimal number of shifts.
25  *
26  * We also provide FDCT routines with various input sample block sizes for
27  * direct resolution reduction or enlargement and for direct resolving the
28  * common 2x1 and 1x2 subsampling cases without additional resampling: NxN
29  * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 output DCT block.
30  *
31  * For N<8 we fill the remaining block coefficients with zero.
32  * For N>8 we apply a partial N-point FDCT on the input samples, computing
33  * just the lower 8 frequency coefficients and discarding the rest.
34  *
35  * We must scale the output coefficients of the N-point FDCT appropriately
36  * to the standard 8-point FDCT level by 8/N per 1-D pass. This scaling
37  * is folded into the constant multipliers (pass 2) and/or final/initial
38  * shifting.
39  *
40  * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
41  * since there would be too many additional constants to pre-calculate.
42  */
43 
44 #define JPEG_INTERNALS
45 #include "jinclude.h"
46 #include "jpeglib.h"
47 #include "jdct.h" /* Private declarations for DCT subsystem */
48 
49 #ifdef DCT_ISLOW_SUPPORTED
50 
51 
52 /*
53  * This module is specialized to the case DCTSIZE = 8.
54  */
55 
56 #if DCTSIZE != 8
57  Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
58 #endif
59 
60 
61 /*
62  * The poop on this scaling stuff is as follows:
63  *
64  * Each 1-D DCT step produces outputs which are a factor of sqrt(N)
65  * larger than the true DCT outputs. The final outputs are therefore
66  * a factor of N larger than desired; since N=8 this can be cured by
67  * a simple right shift at the end of the algorithm. The advantage of
68  * this arrangement is that we save two multiplications per 1-D DCT,
69  * because the y0 and y4 outputs need not be divided by sqrt(N).
70  * In the IJG code, this factor of 8 is removed by the quantization step
71  * (in jcdctmgr.c), NOT in this module.
72  *
73  * We have to do addition and subtraction of the integer inputs, which
74  * is no problem, and multiplication by fractional constants, which is
75  * a problem to do in integer arithmetic. We multiply all the constants
76  * by CONST_SCALE and convert them to integer constants (thus retaining
77  * CONST_BITS bits of precision in the constants). After doing a
78  * multiplication we have to divide the product by CONST_SCALE, with proper
79  * rounding, to produce the correct output. This division can be done
80  * cheaply as a right shift of CONST_BITS bits. We postpone shifting
81  * as long as possible so that partial sums can be added together with
82  * full fractional precision.
83  *
84  * The outputs of the first pass are scaled up by PASS1_BITS bits so that
85  * they are represented to better-than-integral precision. These outputs
86  * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
87  * with the recommended scaling. (For 12-bit sample data, the intermediate
88  * array is INT32 anyway.)
89  *
90  * To avoid overflow of the 32-bit intermediate results in pass 2, we must
91  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis
92  * shows that the values given below are the most effective.
93  */
94 
95 #if BITS_IN_JSAMPLE == 8
96 #define CONST_BITS 13
97 #define PASS1_BITS 2
98 #else
99 #define CONST_BITS 13
100 #define PASS1_BITS 1 /* lose a little precision to avoid overflow */
101 #endif
102 
103 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
104  * causing a lot of useless floating-point operations at run time.
105  * To get around this we use the following pre-calculated constants.
106  * If you change CONST_BITS you may want to add appropriate values.
107  * (With a reasonable C compiler, you can just rely on the FIX() macro...)
108  */
109 
110 #if CONST_BITS == 13
111 #define FIX_0_298631336 ((INT32) 2446) /* FIX(0.298631336) */
112 #define FIX_0_390180644 ((INT32) 3196) /* FIX(0.390180644) */
113 #define FIX_0_541196100 ((INT32) 4433) /* FIX(0.541196100) */
114 #define FIX_0_765366865 ((INT32) 6270) /* FIX(0.765366865) */
115 #define FIX_0_899976223 ((INT32) 7373) /* FIX(0.899976223) */
116 #define FIX_1_175875602 ((INT32) 9633) /* FIX(1.175875602) */
117 #define FIX_1_501321110 ((INT32) 12299) /* FIX(1.501321110) */
118 #define FIX_1_847759065 ((INT32) 15137) /* FIX(1.847759065) */
119 #define FIX_1_961570560 ((INT32) 16069) /* FIX(1.961570560) */
120 #define FIX_2_053119869 ((INT32) 16819) /* FIX(2.053119869) */
121 #define FIX_2_562915447 ((INT32) 20995) /* FIX(2.562915447) */
122 #define FIX_3_072711026 ((INT32) 25172) /* FIX(3.072711026) */
123 #else
124 #define FIX_0_298631336 FIX(0.298631336)
125 #define FIX_0_390180644 FIX(0.390180644)
126 #define FIX_0_541196100 FIX(0.541196100)
127 #define FIX_0_765366865 FIX(0.765366865)
128 #define FIX_0_899976223 FIX(0.899976223)
129 #define FIX_1_175875602 FIX(1.175875602)
130 #define FIX_1_501321110 FIX(1.501321110)
131 #define FIX_1_847759065 FIX(1.847759065)
132 #define FIX_1_961570560 FIX(1.961570560)
133 #define FIX_2_053119869 FIX(2.053119869)
134 #define FIX_2_562915447 FIX(2.562915447)
135 #define FIX_3_072711026 FIX(3.072711026)
136 #endif
137 
138 
139 /* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
140  * For 8-bit samples with the recommended scaling, all the variable
141  * and constant values involved are no more than 16 bits wide, so a
142  * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
143  * For 12-bit samples, a full 32-bit multiplication will be needed.
144  */
145 
146 #if BITS_IN_JSAMPLE == 8
147 #define MULTIPLY(var,const) MULTIPLY16C16(var,const)
148 #else
149 #define MULTIPLY(var,const) ((var) * (const))
150 #endif
151 
152 
153 /*
154  * Perform the forward DCT on one block of samples.
155  */
156 
157 GLOBAL(void)
159 {
160  INT32 tmp0, tmp1, tmp2, tmp3;
161  INT32 tmp10, tmp11, tmp12, tmp13;
162  INT32 z1;
163  DCTELEM *dataptr;
164  JSAMPROW elemptr;
165  int ctr;
167 
168  /* Pass 1: process rows.
169  * Note results are scaled up by sqrt(8) compared to a true DCT;
170  * furthermore, we scale the results by 2**PASS1_BITS.
171  * cK represents sqrt(2) * cos(K*pi/16).
172  */
173 
174  dataptr = data;
175  for (ctr = 0; ctr < DCTSIZE; ctr++) {
176  elemptr = sample_data[ctr] + start_col;
177 
178  /* Even part per LL&M figure 1 --- note that published figure is faulty;
179  * rotator "c1" should be "c6".
180  */
181 
182  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
183  tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
184  tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
185  tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
186 
187  tmp10 = tmp0 + tmp3;
188  tmp12 = tmp0 - tmp3;
189  tmp11 = tmp1 + tmp2;
190  tmp13 = tmp1 - tmp2;
191 
192  tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
193  tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
194  tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
195  tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
196 
197  /* Apply unsigned->signed conversion. */
198  dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
199  dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
200 
201  z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); /* c6 */
202  /* Add fudge factor here for final descale. */
203  z1 += ONE << (CONST_BITS-PASS1_BITS-1);
204 
205  dataptr[2] = (DCTELEM)
206  RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
207  CONST_BITS-PASS1_BITS);
208  dataptr[6] = (DCTELEM)
209  RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
210  CONST_BITS-PASS1_BITS);
211 
212  /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
213  * i0..i3 in the paper are tmp0..tmp3 here.
214  */
215 
216  tmp12 = tmp0 + tmp2;
217  tmp13 = tmp1 + tmp3;
218 
219  z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
220  /* Add fudge factor here for final descale. */
221  z1 += ONE << (CONST_BITS-PASS1_BITS-1);
222 
223  tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* -c3+c5 */
224  tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
225  tmp12 += z1;
226  tmp13 += z1;
227 
228  z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
229  tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
230  tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
231  tmp0 += z1 + tmp12;
232  tmp3 += z1 + tmp13;
233 
234  z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
235  tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
236  tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
237  tmp1 += z1 + tmp13;
238  tmp2 += z1 + tmp12;
239 
240  dataptr[1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS-PASS1_BITS);
241  dataptr[3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS-PASS1_BITS);
242  dataptr[5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
243  dataptr[7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS-PASS1_BITS);
244 
245  dataptr += DCTSIZE; /* advance pointer to next row */
246  }
247 
248  /* Pass 2: process columns.
249  * We remove the PASS1_BITS scaling, but leave the results scaled up
250  * by an overall factor of 8.
251  * cK represents sqrt(2) * cos(K*pi/16).
252  */
253 
254  dataptr = data;
255  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
256  /* Even part per LL&M figure 1 --- note that published figure is faulty;
257  * rotator "c1" should be "c6".
258  */
259 
260  tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
261  tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
262  tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
263  tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
264 
265  /* Add fudge factor here for final descale. */
266  tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
267  tmp12 = tmp0 - tmp3;
268  tmp11 = tmp1 + tmp2;
269  tmp13 = tmp1 - tmp2;
270 
271  tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
272  tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
273  tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
274  tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
275 
276  dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
277  dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
278 
279  z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); /* c6 */
280  /* Add fudge factor here for final descale. */
281  z1 += ONE << (CONST_BITS+PASS1_BITS-1);
282 
283  dataptr[DCTSIZE*2] = (DCTELEM)
284  RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
285  CONST_BITS+PASS1_BITS);
286  dataptr[DCTSIZE*6] = (DCTELEM)
287  RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
288  CONST_BITS+PASS1_BITS);
289 
290  /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
291  * i0..i3 in the paper are tmp0..tmp3 here.
292  */
293 
294  tmp12 = tmp0 + tmp2;
295  tmp13 = tmp1 + tmp3;
296 
297  z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
298  /* Add fudge factor here for final descale. */
299  z1 += ONE << (CONST_BITS+PASS1_BITS-1);
300 
301  tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* -c3+c5 */
302  tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
303  tmp12 += z1;
304  tmp13 += z1;
305 
306  z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
307  tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
308  tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
309  tmp0 += z1 + tmp12;
310  tmp3 += z1 + tmp13;
311 
312  z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
313  tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
314  tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
315  tmp1 += z1 + tmp13;
316  tmp2 += z1 + tmp12;
317 
318  dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS+PASS1_BITS);
319  dataptr[DCTSIZE*3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS+PASS1_BITS);
320  dataptr[DCTSIZE*5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS+PASS1_BITS);
321  dataptr[DCTSIZE*7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS+PASS1_BITS);
322 
323  dataptr++; /* advance pointer to next column */
324  }
325 }
326 
327 #ifdef DCT_SCALING_SUPPORTED
328 
329 
330 /*
331  * Perform the forward DCT on a 7x7 sample block.
332  */
333 
334 GLOBAL(void)
336 {
337  INT32 tmp0, tmp1, tmp2, tmp3;
338  INT32 tmp10, tmp11, tmp12;
339  INT32 z1, z2, z3;
340  DCTELEM *dataptr;
341  JSAMPROW elemptr;
342  int ctr;
344 
345  /* Pre-zero output coefficient block. */
347 
348  /* Pass 1: process rows.
349  * Note results are scaled up by sqrt(8) compared to a true DCT;
350  * furthermore, we scale the results by 2**PASS1_BITS.
351  * cK represents sqrt(2) * cos(K*pi/14).
352  */
353 
354  dataptr = data;
355  for (ctr = 0; ctr < 7; ctr++) {
356  elemptr = sample_data[ctr] + start_col;
357 
358  /* Even part */
359 
360  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
361  tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
362  tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
363  tmp3 = GETJSAMPLE(elemptr[3]);
364 
365  tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
366  tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
367  tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
368 
369  z1 = tmp0 + tmp2;
370  /* Apply unsigned->signed conversion. */
371  dataptr[0] = (DCTELEM)
372  ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
373  tmp3 += tmp3;
374  z1 -= tmp3;
375  z1 -= tmp3;
376  z1 = MULTIPLY(z1, FIX(0.353553391)); /* (c2+c6-c4)/2 */
377  z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002)); /* (c2+c4-c6)/2 */
378  z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123)); /* c6 */
379  dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
380  z1 -= z2;
381  z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734)); /* c4 */
382  dataptr[4] = (DCTELEM)
383  DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
384  CONST_BITS-PASS1_BITS);
385  dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
386 
387  /* Odd part */
388 
389  tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347)); /* (c3+c1-c5)/2 */
390  tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339)); /* (c3+c5-c1)/2 */
391  tmp0 = tmp1 - tmp2;
392  tmp1 += tmp2;
393  tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
394  tmp1 += tmp2;
395  tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268)); /* c5 */
396  tmp0 += tmp3;
397  tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693)); /* c3+c1-c5 */
398 
399  dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
400  dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
401  dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
402 
403  dataptr += DCTSIZE; /* advance pointer to next row */
404  }
405 
406  /* Pass 2: process columns.
407  * We remove the PASS1_BITS scaling, but leave the results scaled up
408  * by an overall factor of 8.
409  * We must also scale the output by (8/7)**2 = 64/49, which we fold
410  * into the constant multipliers:
411  * cK now represents sqrt(2) * cos(K*pi/14) * 64/49.
412  */
413 
414  dataptr = data;
415  for (ctr = 0; ctr < 7; ctr++) {
416  /* Even part */
417 
418  tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
419  tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
420  tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
421  tmp3 = dataptr[DCTSIZE*3];
422 
423  tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
424  tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
425  tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];
426 
427  z1 = tmp0 + tmp2;
428  dataptr[DCTSIZE*0] = (DCTELEM)
429  DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
430  CONST_BITS+PASS1_BITS);
431  tmp3 += tmp3;
432  z1 -= tmp3;
433  z1 -= tmp3;
434  z1 = MULTIPLY(z1, FIX(0.461784020)); /* (c2+c6-c4)/2 */
435  z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084)); /* (c2+c4-c6)/2 */
436  z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446)); /* c6 */
437  dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS);
438  z1 -= z2;
439  z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509)); /* c4 */
440  dataptr[DCTSIZE*4] = (DCTELEM)
441  DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
442  CONST_BITS+PASS1_BITS);
443  dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS);
444 
445  /* Odd part */
446 
447  tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677)); /* (c3+c1-c5)/2 */
448  tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464)); /* (c3+c5-c1)/2 */
449  tmp0 = tmp1 - tmp2;
450  tmp1 += tmp2;
451  tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
452  tmp1 += tmp2;
453  tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310)); /* c5 */
454  tmp0 += tmp3;
455  tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355)); /* c3+c1-c5 */
456 
457  dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS);
458  dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS);
459  dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS);
460 
461  dataptr++; /* advance pointer to next column */
462  }
463 }
464 
465 
466 /*
467  * Perform the forward DCT on a 6x6 sample block.
468  */
469 
470 GLOBAL(void)
472 {
473  INT32 tmp0, tmp1, tmp2;
474  INT32 tmp10, tmp11, tmp12;
475  DCTELEM *dataptr;
476  JSAMPROW elemptr;
477  int ctr;
479 
480  /* Pre-zero output coefficient block. */
482 
483  /* Pass 1: process rows.
484  * Note results are scaled up by sqrt(8) compared to a true DCT;
485  * furthermore, we scale the results by 2**PASS1_BITS.
486  * cK represents sqrt(2) * cos(K*pi/12).
487  */
488 
489  dataptr = data;
490  for (ctr = 0; ctr < 6; ctr++) {
491  elemptr = sample_data[ctr] + start_col;
492 
493  /* Even part */
494 
495  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
496  tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
497  tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
498 
499  tmp10 = tmp0 + tmp2;
500  tmp12 = tmp0 - tmp2;
501 
502  tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
503  tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
504  tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
505 
506  /* Apply unsigned->signed conversion. */
507  dataptr[0] = (DCTELEM)
508  ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
509  dataptr[2] = (DCTELEM)
510  DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
511  CONST_BITS-PASS1_BITS);
512  dataptr[4] = (DCTELEM)
513  DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
514  CONST_BITS-PASS1_BITS);
515 
516  /* Odd part */
517 
518  tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
519  CONST_BITS-PASS1_BITS);
520 
521  dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
522  dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
523  dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
524 
525  dataptr += DCTSIZE; /* advance pointer to next row */
526  }
527 
528  /* Pass 2: process columns.
529  * We remove the PASS1_BITS scaling, but leave the results scaled up
530  * by an overall factor of 8.
531  * We must also scale the output by (8/6)**2 = 16/9, which we fold
532  * into the constant multipliers:
533  * cK now represents sqrt(2) * cos(K*pi/12) * 16/9.
534  */
535 
536  dataptr = data;
537  for (ctr = 0; ctr < 6; ctr++) {
538  /* Even part */
539 
540  tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
541  tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
542  tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
543 
544  tmp10 = tmp0 + tmp2;
545  tmp12 = tmp0 - tmp2;
546 
547  tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
548  tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
549  tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
550 
551  dataptr[DCTSIZE*0] = (DCTELEM)
552  DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
553  CONST_BITS+PASS1_BITS);
554  dataptr[DCTSIZE*2] = (DCTELEM)
555  DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
556  CONST_BITS+PASS1_BITS);
557  dataptr[DCTSIZE*4] = (DCTELEM)
558  DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
559  CONST_BITS+PASS1_BITS);
560 
561  /* Odd part */
562 
563  tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
564 
565  dataptr[DCTSIZE*1] = (DCTELEM)
566  DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
567  CONST_BITS+PASS1_BITS);
568  dataptr[DCTSIZE*3] = (DCTELEM)
569  DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
570  CONST_BITS+PASS1_BITS);
571  dataptr[DCTSIZE*5] = (DCTELEM)
572  DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
573  CONST_BITS+PASS1_BITS);
574 
575  dataptr++; /* advance pointer to next column */
576  }
577 }
578 
579 
580 /*
581  * Perform the forward DCT on a 5x5 sample block.
582  */
583 
584 GLOBAL(void)
586 {
587  INT32 tmp0, tmp1, tmp2;
588  INT32 tmp10, tmp11;
589  DCTELEM *dataptr;
590  JSAMPROW elemptr;
591  int ctr;
593 
594  /* Pre-zero output coefficient block. */
596 
597  /* Pass 1: process rows.
598  * Note results are scaled up by sqrt(8) compared to a true DCT;
599  * furthermore, we scale the results by 2**PASS1_BITS.
600  * We scale the results further by 2 as part of output adaption
601  * scaling for different DCT size.
602  * cK represents sqrt(2) * cos(K*pi/10).
603  */
604 
605  dataptr = data;
606  for (ctr = 0; ctr < 5; ctr++) {
607  elemptr = sample_data[ctr] + start_col;
608 
609  /* Even part */
610 
611  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
612  tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
613  tmp2 = GETJSAMPLE(elemptr[2]);
614 
615  tmp10 = tmp0 + tmp1;
616  tmp11 = tmp0 - tmp1;
617 
618  tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
619  tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
620 
621  /* Apply unsigned->signed conversion. */
622  dataptr[0] = (DCTELEM)
623  ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << (PASS1_BITS+1));
624  tmp11 = MULTIPLY(tmp11, FIX(0.790569415)); /* (c2+c4)/2 */
625  tmp10 -= tmp2 << 2;
626  tmp10 = MULTIPLY(tmp10, FIX(0.353553391)); /* (c2-c4)/2 */
627  dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS-1);
628  dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS-1);
629 
630  /* Odd part */
631 
632  tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876)); /* c3 */
633 
634  dataptr[1] = (DCTELEM)
635  DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
636  CONST_BITS-PASS1_BITS-1);
637  dataptr[3] = (DCTELEM)
638  DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
639  CONST_BITS-PASS1_BITS-1);
640 
641  dataptr += DCTSIZE; /* advance pointer to next row */
642  }
643 
644  /* Pass 2: process columns.
645  * We remove the PASS1_BITS scaling, but leave the results scaled up
646  * by an overall factor of 8.
647  * We must also scale the output by (8/5)**2 = 64/25, which we partially
648  * fold into the constant multipliers (other part was done in pass 1):
649  * cK now represents sqrt(2) * cos(K*pi/10) * 32/25.
650  */
651 
652  dataptr = data;
653  for (ctr = 0; ctr < 5; ctr++) {
654  /* Even part */
655 
656  tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
657  tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
658  tmp2 = dataptr[DCTSIZE*2];
659 
660  tmp10 = tmp0 + tmp1;
661  tmp11 = tmp0 - tmp1;
662 
663  tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
664  tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];
665 
666  dataptr[DCTSIZE*0] = (DCTELEM)
667  DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)), /* 32/25 */
668  CONST_BITS+PASS1_BITS);
669  tmp11 = MULTIPLY(tmp11, FIX(1.011928851)); /* (c2+c4)/2 */
670  tmp10 -= tmp2 << 2;
671  tmp10 = MULTIPLY(tmp10, FIX(0.452548340)); /* (c2-c4)/2 */
672  dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
673  dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
674 
675  /* Odd part */
676 
677  tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961)); /* c3 */
678 
679  dataptr[DCTSIZE*1] = (DCTELEM)
680  DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
681  CONST_BITS+PASS1_BITS);
682  dataptr[DCTSIZE*3] = (DCTELEM)
683  DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
684  CONST_BITS+PASS1_BITS);
685 
686  dataptr++; /* advance pointer to next column */
687  }
688 }
689 
690 
691 /*
692  * Perform the forward DCT on a 4x4 sample block.
693  */
694 
695 GLOBAL(void)
697 {
698  INT32 tmp0, tmp1;
699  INT32 tmp10, tmp11;
700  DCTELEM *dataptr;
701  JSAMPROW elemptr;
702  int ctr;
704 
705  /* Pre-zero output coefficient block. */
707 
708  /* Pass 1: process rows.
709  * Note results are scaled up by sqrt(8) compared to a true DCT;
710  * furthermore, we scale the results by 2**PASS1_BITS.
711  * We must also scale the output by (8/4)**2 = 2**2, which we add here.
712  * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
713  */
714 
715  dataptr = data;
716  for (ctr = 0; ctr < 4; ctr++) {
717  elemptr = sample_data[ctr] + start_col;
718 
719  /* Even part */
720 
721  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
722  tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
723 
724  tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
725  tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
726 
727  /* Apply unsigned->signed conversion. */
728  dataptr[0] = (DCTELEM)
729  ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+2));
730  dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+2));
731 
732  /* Odd part */
733 
734  tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
735  /* Add fudge factor here for final descale. */
736  tmp0 += ONE << (CONST_BITS-PASS1_BITS-3);
737 
738  dataptr[1] = (DCTELEM)
739  RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
740  CONST_BITS-PASS1_BITS-2);
741  dataptr[3] = (DCTELEM)
742  RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
743  CONST_BITS-PASS1_BITS-2);
744 
745  dataptr += DCTSIZE; /* advance pointer to next row */
746  }
747 
748  /* Pass 2: process columns.
749  * We remove the PASS1_BITS scaling, but leave the results scaled up
750  * by an overall factor of 8.
751  * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
752  */
753 
754  dataptr = data;
755  for (ctr = 0; ctr < 4; ctr++) {
756  /* Even part */
757 
758  /* Add fudge factor here for final descale. */
759  tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
760  tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
761 
762  tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
763  tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
764 
765  dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
766  dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
767 
768  /* Odd part */
769 
770  tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
771  /* Add fudge factor here for final descale. */
772  tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
773 
774  dataptr[DCTSIZE*1] = (DCTELEM)
775  RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
776  CONST_BITS+PASS1_BITS);
777  dataptr[DCTSIZE*3] = (DCTELEM)
778  RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
779  CONST_BITS+PASS1_BITS);
780 
781  dataptr++; /* advance pointer to next column */
782  }
783 }
784 
785 
786 /*
787  * Perform the forward DCT on a 3x3 sample block.
788  */
789 
790 GLOBAL(void)
792 {
793  INT32 tmp0, tmp1, tmp2;
794  DCTELEM *dataptr;
795  JSAMPROW elemptr;
796  int ctr;
798 
799  /* Pre-zero output coefficient block. */
801 
802  /* Pass 1: process rows.
803  * Note results are scaled up by sqrt(8) compared to a true DCT;
804  * furthermore, we scale the results by 2**PASS1_BITS.
805  * We scale the results further by 2**2 as part of output adaption
806  * scaling for different DCT size.
807  * cK represents sqrt(2) * cos(K*pi/6).
808  */
809 
810  dataptr = data;
811  for (ctr = 0; ctr < 3; ctr++) {
812  elemptr = sample_data[ctr] + start_col;
813 
814  /* Even part */
815 
816  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
817  tmp1 = GETJSAMPLE(elemptr[1]);
818 
819  tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
820 
821  /* Apply unsigned->signed conversion. */
822  dataptr[0] = (DCTELEM)
823  ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+2));
824  dataptr[2] = (DCTELEM)
825  DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
826  CONST_BITS-PASS1_BITS-2);
827 
828  /* Odd part */
829 
830  dataptr[1] = (DCTELEM)
831  DESCALE(MULTIPLY(tmp2, FIX(1.224744871)), /* c1 */
832  CONST_BITS-PASS1_BITS-2);
833 
834  dataptr += DCTSIZE; /* advance pointer to next row */
835  }
836 
837  /* Pass 2: process columns.
838  * We remove the PASS1_BITS scaling, but leave the results scaled up
839  * by an overall factor of 8.
840  * We must also scale the output by (8/3)**2 = 64/9, which we partially
841  * fold into the constant multipliers (other part was done in pass 1):
842  * cK now represents sqrt(2) * cos(K*pi/6) * 16/9.
843  */
844 
845  dataptr = data;
846  for (ctr = 0; ctr < 3; ctr++) {
847  /* Even part */
848 
849  tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2];
850  tmp1 = dataptr[DCTSIZE*1];
851 
852  tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2];
853 
854  dataptr[DCTSIZE*0] = (DCTELEM)
855  DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
856  CONST_BITS+PASS1_BITS);
857  dataptr[DCTSIZE*2] = (DCTELEM)
858  DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
859  CONST_BITS+PASS1_BITS);
860 
861  /* Odd part */
862 
863  dataptr[DCTSIZE*1] = (DCTELEM)
864  DESCALE(MULTIPLY(tmp2, FIX(2.177324216)), /* c1 */
865  CONST_BITS+PASS1_BITS);
866 
867  dataptr++; /* advance pointer to next column */
868  }
869 }
870 
871 
872 /*
873  * Perform the forward DCT on a 2x2 sample block.
874  */
875 
876 GLOBAL(void)
878 {
879  DCTELEM tmp0, tmp1, tmp2, tmp3;
880  JSAMPROW elemptr;
881 
882  /* Pre-zero output coefficient block. */
884 
885  /* Pass 1: process rows.
886  * Note results are scaled up by sqrt(8) compared to a true DCT.
887  */
888 
889  /* Row 0 */
890  elemptr = sample_data[0] + start_col;
891 
892  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
893  tmp1 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
894 
895  /* Row 1 */
896  elemptr = sample_data[1] + start_col;
897 
898  tmp2 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
899  tmp3 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
900 
901  /* Pass 2: process columns.
902  * We leave the results scaled up by an overall factor of 8.
903  * We must also scale the output by (8/2)**2 = 2**4.
904  */
905 
906  /* Column 0 */
907  /* Apply unsigned->signed conversion. */
908  data[DCTSIZE*0] = (tmp0 + tmp2 - 4 * CENTERJSAMPLE) << 4;
909  data[DCTSIZE*1] = (tmp0 - tmp2) << 4;
910 
911  /* Column 1 */
912  data[DCTSIZE*0+1] = (tmp1 + tmp3) << 4;
913  data[DCTSIZE*1+1] = (tmp1 - tmp3) << 4;
914 }
915 
916 
917 /*
918  * Perform the forward DCT on a 1x1 sample block.
919  */
920 
921 GLOBAL(void)
923 {
924  DCTELEM dcval;
925 
926  /* Pre-zero output coefficient block. */
928 
929  dcval = GETJSAMPLE(sample_data[0][start_col]);
930 
931  /* We leave the result scaled up by an overall factor of 8. */
932  /* We must also scale the output by (8/1)**2 = 2**6. */
933  /* Apply unsigned->signed conversion. */
934  data[0] = (dcval - CENTERJSAMPLE) << 6;
935 }
936 
937 
938 /*
939  * Perform the forward DCT on a 9x9 sample block.
940  */
941 
942 GLOBAL(void)
944 {
945  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
946  INT32 tmp10, tmp11, tmp12, tmp13;
947  INT32 z1, z2;
948  DCTELEM workspace[8];
949  DCTELEM *dataptr;
950  DCTELEM *wsptr;
951  JSAMPROW elemptr;
952  int ctr;
954 
955  /* Pass 1: process rows.
956  * Note results are scaled up by sqrt(8) compared to a true DCT;
957  * we scale the results further by 2 as part of output adaption
958  * scaling for different DCT size.
959  * cK represents sqrt(2) * cos(K*pi/18).
960  */
961 
962  dataptr = data;
963  ctr = 0;
964  for (;;) {
965  elemptr = sample_data[ctr] + start_col;
966 
967  /* Even part */
968 
969  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[8]);
970  tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[7]);
971  tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[6]);
972  tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[5]);
973  tmp4 = GETJSAMPLE(elemptr[4]);
974 
975  tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[8]);
976  tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[7]);
977  tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[6]);
978  tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[5]);
979 
980  z1 = tmp0 + tmp2 + tmp3;
981  z2 = tmp1 + tmp4;
982  /* Apply unsigned->signed conversion. */
983  dataptr[0] = (DCTELEM) ((z1 + z2 - 9 * CENTERJSAMPLE) << 1);
984  dataptr[6] = (DCTELEM)
985  DESCALE(MULTIPLY(z1 - z2 - z2, FIX(0.707106781)), /* c6 */
986  CONST_BITS-1);
987  z1 = MULTIPLY(tmp0 - tmp2, FIX(1.328926049)); /* c2 */
988  z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(0.707106781)); /* c6 */
989  dataptr[2] = (DCTELEM)
990  DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.083350441)) /* c4 */
991  + z1 + z2, CONST_BITS-1);
992  dataptr[4] = (DCTELEM)
993  DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.245575608)) /* c8 */
994  + z1 - z2, CONST_BITS-1);
995 
996  /* Odd part */
997 
998  dataptr[3] = (DCTELEM)
999  DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.224744871)), /* c3 */
1000  CONST_BITS-1);
1001 
1002  tmp11 = MULTIPLY(tmp11, FIX(1.224744871)); /* c3 */
1003  tmp0 = MULTIPLY(tmp10 + tmp12, FIX(0.909038955)); /* c5 */
1004  tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.483689525)); /* c7 */
1005 
1006  dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS-1);
1007 
1008  tmp2 = MULTIPLY(tmp12 - tmp13, FIX(1.392728481)); /* c1 */
1009 
1010  dataptr[5] = (DCTELEM) DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS-1);
1011  dataptr[7] = (DCTELEM) DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS-1);
1012 
1013  ctr++;
1014 
1015  if (ctr != DCTSIZE) {
1016  if (ctr == 9)
1017  break; /* Done. */
1018  dataptr += DCTSIZE; /* advance pointer to next row */
1019  } else
1020  dataptr = workspace; /* switch pointer to extended workspace */
1021  }
1022 
1023  /* Pass 2: process columns.
1024  * We leave the results scaled up by an overall factor of 8.
1025  * We must also scale the output by (8/9)**2 = 64/81, which we partially
1026  * fold into the constant multipliers and final/initial shifting:
1027  * cK now represents sqrt(2) * cos(K*pi/18) * 128/81.
1028  */
1029 
1030  dataptr = data;
1031  wsptr = workspace;
1032  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1033  /* Even part */
1034 
1035  tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*0];
1036  tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*7];
1037  tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*6];
1038  tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*5];
1039  tmp4 = dataptr[DCTSIZE*4];
1040 
1041  tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*0];
1042  tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*7];
1043  tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*6];
1044  tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*5];
1045 
1046  z1 = tmp0 + tmp2 + tmp3;
1047  z2 = tmp1 + tmp4;
1048  dataptr[DCTSIZE*0] = (DCTELEM)
1049  DESCALE(MULTIPLY(z1 + z2, FIX(1.580246914)), /* 128/81 */
1050  CONST_BITS+2);
1051  dataptr[DCTSIZE*6] = (DCTELEM)
1052  DESCALE(MULTIPLY(z1 - z2 - z2, FIX(1.117403309)), /* c6 */
1053  CONST_BITS+2);
1054  z1 = MULTIPLY(tmp0 - tmp2, FIX(2.100031287)); /* c2 */
1055  z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(1.117403309)); /* c6 */
1056  dataptr[DCTSIZE*2] = (DCTELEM)
1057  DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.711961190)) /* c4 */
1058  + z1 + z2, CONST_BITS+2);
1059  dataptr[DCTSIZE*4] = (DCTELEM)
1060  DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.388070096)) /* c8 */
1061  + z1 - z2, CONST_BITS+2);
1062 
1063  /* Odd part */
1064 
1065  dataptr[DCTSIZE*3] = (DCTELEM)
1066  DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.935399303)), /* c3 */
1067  CONST_BITS+2);
1068 
1069  tmp11 = MULTIPLY(tmp11, FIX(1.935399303)); /* c3 */
1070  tmp0 = MULTIPLY(tmp10 + tmp12, FIX(1.436506004)); /* c5 */
1071  tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.764348879)); /* c7 */
1072 
1073  dataptr[DCTSIZE*1] = (DCTELEM)
1074  DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS+2);
1075 
1076  tmp2 = MULTIPLY(tmp12 - tmp13, FIX(2.200854883)); /* c1 */
1077 
1078  dataptr[DCTSIZE*5] = (DCTELEM)
1079  DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS+2);
1080  dataptr[DCTSIZE*7] = (DCTELEM)
1081  DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS+2);
1082 
1083  dataptr++; /* advance pointer to next column */
1084  wsptr++; /* advance pointer to next column */
1085  }
1086 }
1087 
1088 
1089 /*
1090  * Perform the forward DCT on a 10x10 sample block.
1091  */
1092 
1093 GLOBAL(void)
1094 jpeg_fdct_10x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1095 {
1096  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
1097  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1098  DCTELEM workspace[8*2];
1099  DCTELEM *dataptr;
1100  DCTELEM *wsptr;
1101  JSAMPROW elemptr;
1102  int ctr;
1103  SHIFT_TEMPS
1104 
1105  /* Pass 1: process rows.
1106  * Note results are scaled up by sqrt(8) compared to a true DCT;
1107  * we scale the results further by 2 as part of output adaption
1108  * scaling for different DCT size.
1109  * cK represents sqrt(2) * cos(K*pi/20).
1110  */
1111 
1112  dataptr = data;
1113  ctr = 0;
1114  for (;;) {
1115  elemptr = sample_data[ctr] + start_col;
1116 
1117  /* Even part */
1118 
1119  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
1120  tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
1121  tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
1122  tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
1123  tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
1124 
1125  tmp10 = tmp0 + tmp4;
1126  tmp13 = tmp0 - tmp4;
1127  tmp11 = tmp1 + tmp3;
1128  tmp14 = tmp1 - tmp3;
1129 
1130  tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
1131  tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
1132  tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
1133  tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
1134  tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
1135 
1136  /* Apply unsigned->signed conversion. */
1137  dataptr[0] = (DCTELEM)
1138  ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << 1);
1139  tmp12 += tmp12;
1140  dataptr[4] = (DCTELEM)
1141  DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
1142  MULTIPLY(tmp11 - tmp12, FIX(0.437016024)), /* c8 */
1143  CONST_BITS-1);
1144  tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876)); /* c6 */
1145  dataptr[2] = (DCTELEM)
1146  DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)), /* c2-c6 */
1147  CONST_BITS-1);
1148  dataptr[6] = (DCTELEM)
1149  DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)), /* c2+c6 */
1150  CONST_BITS-1);
1151 
1152  /* Odd part */
1153 
1154  tmp10 = tmp0 + tmp4;
1155  tmp11 = tmp1 - tmp3;
1156  dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << 1);
1157  tmp2 <<= CONST_BITS;
1158  dataptr[1] = (DCTELEM)
1159  DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) + /* c1 */
1160  MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 + /* c3 */
1161  MULTIPLY(tmp3, FIX(0.642039522)) + /* c7 */
1162  MULTIPLY(tmp4, FIX(0.221231742)), /* c9 */
1163  CONST_BITS-1);
1164  tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) - /* (c3+c7)/2 */
1165  MULTIPLY(tmp1 + tmp3, FIX(0.587785252)); /* (c1-c9)/2 */
1166  tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) + /* (c3-c7)/2 */
1167  (tmp11 << (CONST_BITS - 1)) - tmp2;
1168  dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-1);
1169  dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-1);
1170 
1171  ctr++;
1172 
1173  if (ctr != DCTSIZE) {
1174  if (ctr == 10)
1175  break; /* Done. */
1176  dataptr += DCTSIZE; /* advance pointer to next row */
1177  } else
1178  dataptr = workspace; /* switch pointer to extended workspace */
1179  }
1180 
1181  /* Pass 2: process columns.
1182  * We leave the results scaled up by an overall factor of 8.
1183  * We must also scale the output by (8/10)**2 = 16/25, which we partially
1184  * fold into the constant multipliers and final/initial shifting:
1185  * cK now represents sqrt(2) * cos(K*pi/20) * 32/25.
1186  */
1187 
1188  dataptr = data;
1189  wsptr = workspace;
1190  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1191  /* Even part */
1192 
1193  tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
1194  tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
1195  tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
1196  tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
1197  tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
1198 
1199  tmp10 = tmp0 + tmp4;
1200  tmp13 = tmp0 - tmp4;
1201  tmp11 = tmp1 + tmp3;
1202  tmp14 = tmp1 - tmp3;
1203 
1204  tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
1205  tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
1206  tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
1207  tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
1208  tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
1209 
1210  dataptr[DCTSIZE*0] = (DCTELEM)
1211  DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
1212  CONST_BITS+2);
1213  tmp12 += tmp12;
1214  dataptr[DCTSIZE*4] = (DCTELEM)
1215  DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
1216  MULTIPLY(tmp11 - tmp12, FIX(0.559380511)), /* c8 */
1217  CONST_BITS+2);
1218  tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961)); /* c6 */
1219  dataptr[DCTSIZE*2] = (DCTELEM)
1220  DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)), /* c2-c6 */
1221  CONST_BITS+2);
1222  dataptr[DCTSIZE*6] = (DCTELEM)
1223  DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)), /* c2+c6 */
1224  CONST_BITS+2);
1225 
1226  /* Odd part */
1227 
1228  tmp10 = tmp0 + tmp4;
1229  tmp11 = tmp1 - tmp3;
1230  dataptr[DCTSIZE*5] = (DCTELEM)
1231  DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)), /* 32/25 */
1232  CONST_BITS+2);
1233  tmp2 = MULTIPLY(tmp2, FIX(1.28)); /* 32/25 */
1234  dataptr[DCTSIZE*1] = (DCTELEM)
1235  DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) + /* c1 */
1236  MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 + /* c3 */
1237  MULTIPLY(tmp3, FIX(0.821810588)) + /* c7 */
1238  MULTIPLY(tmp4, FIX(0.283176630)), /* c9 */
1239  CONST_BITS+2);
1240  tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) - /* (c3+c7)/2 */
1241  MULTIPLY(tmp1 + tmp3, FIX(0.752365123)); /* (c1-c9)/2 */
1242  tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) + /* (c3-c7)/2 */
1243  MULTIPLY(tmp11, FIX(0.64)) - tmp2; /* 16/25 */
1244  dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+2);
1245  dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+2);
1246 
1247  dataptr++; /* advance pointer to next column */
1248  wsptr++; /* advance pointer to next column */
1249  }
1250 }
1251 
1252 
1253 /*
1254  * Perform the forward DCT on an 11x11 sample block.
1255  */
1256 
1257 GLOBAL(void)
1258 jpeg_fdct_11x11 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1259 {
1260  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1261  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1262  INT32 z1, z2, z3;
1263  DCTELEM workspace[8*3];
1264  DCTELEM *dataptr;
1265  DCTELEM *wsptr;
1266  JSAMPROW elemptr;
1267  int ctr;
1268  SHIFT_TEMPS
1269 
1270  /* Pass 1: process rows.
1271  * Note results are scaled up by sqrt(8) compared to a true DCT;
1272  * we scale the results further by 2 as part of output adaption
1273  * scaling for different DCT size.
1274  * cK represents sqrt(2) * cos(K*pi/22).
1275  */
1276 
1277  dataptr = data;
1278  ctr = 0;
1279  for (;;) {
1280  elemptr = sample_data[ctr] + start_col;
1281 
1282  /* Even part */
1283 
1284  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[10]);
1285  tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[9]);
1286  tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[8]);
1287  tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[7]);
1288  tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[6]);
1289  tmp5 = GETJSAMPLE(elemptr[5]);
1290 
1291  tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[10]);
1292  tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[9]);
1293  tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[8]);
1294  tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[7]);
1295  tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[6]);
1296 
1297  /* Apply unsigned->signed conversion. */
1298  dataptr[0] = (DCTELEM)
1299  ((tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 - 11 * CENTERJSAMPLE) << 1);
1300  tmp5 += tmp5;
1301  tmp0 -= tmp5;
1302  tmp1 -= tmp5;
1303  tmp2 -= tmp5;
1304  tmp3 -= tmp5;
1305  tmp4 -= tmp5;
1306  z1 = MULTIPLY(tmp0 + tmp3, FIX(1.356927976)) + /* c2 */
1307  MULTIPLY(tmp2 + tmp4, FIX(0.201263574)); /* c10 */
1308  z2 = MULTIPLY(tmp1 - tmp3, FIX(0.926112931)); /* c6 */
1309  z3 = MULTIPLY(tmp0 - tmp1, FIX(1.189712156)); /* c4 */
1310  dataptr[2] = (DCTELEM)
1311  DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.018300590)) /* c2+c8-c6 */
1312  - MULTIPLY(tmp4, FIX(1.390975730)), /* c4+c10 */
1313  CONST_BITS-1);
1314  dataptr[4] = (DCTELEM)
1315  DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.062335650)) /* c4-c6-c10 */
1316  - MULTIPLY(tmp2, FIX(1.356927976)) /* c2 */
1317  + MULTIPLY(tmp4, FIX(0.587485545)), /* c8 */
1318  CONST_BITS-1);
1319  dataptr[6] = (DCTELEM)
1320  DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.620527200)) /* c2+c4-c6 */
1321  - MULTIPLY(tmp2, FIX(0.788749120)), /* c8+c10 */
1322  CONST_BITS-1);
1323 
1324  /* Odd part */
1325 
1326  tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.286413905)); /* c3 */
1327  tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.068791298)); /* c5 */
1328  tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.764581576)); /* c7 */
1329  tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.719967871)) /* c7+c5+c3-c1 */
1330  + MULTIPLY(tmp14, FIX(0.398430003)); /* c9 */
1331  tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.764581576)); /* -c7 */
1332  tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.399818907)); /* -c1 */
1333  tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.276416582)) /* c9+c7+c1-c3 */
1334  - MULTIPLY(tmp14, FIX(1.068791298)); /* c5 */
1335  tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.398430003)); /* c9 */
1336  tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(1.989053629)) /* c9+c5+c3-c7 */
1337  + MULTIPLY(tmp14, FIX(1.399818907)); /* c1 */
1338  tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.305598626)) /* c1+c5-c9-c7 */
1339  - MULTIPLY(tmp14, FIX(1.286413905)); /* c3 */
1340 
1341  dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-1);
1342  dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-1);
1343  dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-1);
1344  dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS-1);
1345 
1346  ctr++;
1347 
1348  if (ctr != DCTSIZE) {
1349  if (ctr == 11)
1350  break; /* Done. */
1351  dataptr += DCTSIZE; /* advance pointer to next row */
1352  } else
1353  dataptr = workspace; /* switch pointer to extended workspace */
1354  }
1355 
1356  /* Pass 2: process columns.
1357  * We leave the results scaled up by an overall factor of 8.
1358  * We must also scale the output by (8/11)**2 = 64/121, which we partially
1359  * fold into the constant multipliers and final/initial shifting:
1360  * cK now represents sqrt(2) * cos(K*pi/22) * 128/121.
1361  */
1362 
1363  dataptr = data;
1364  wsptr = workspace;
1365  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1366  /* Even part */
1367 
1368  tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*2];
1369  tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*1];
1370  tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*0];
1371  tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*7];
1372  tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*6];
1373  tmp5 = dataptr[DCTSIZE*5];
1374 
1375  tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*2];
1376  tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*1];
1377  tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*0];
1378  tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*7];
1379  tmp14 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*6];
1380 
1381  dataptr[DCTSIZE*0] = (DCTELEM)
1382  DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5,
1383  FIX(1.057851240)), /* 128/121 */
1384  CONST_BITS+2);
1385  tmp5 += tmp5;
1386  tmp0 -= tmp5;
1387  tmp1 -= tmp5;
1388  tmp2 -= tmp5;
1389  tmp3 -= tmp5;
1390  tmp4 -= tmp5;
1391  z1 = MULTIPLY(tmp0 + tmp3, FIX(1.435427942)) + /* c2 */
1392  MULTIPLY(tmp2 + tmp4, FIX(0.212906922)); /* c10 */
1393  z2 = MULTIPLY(tmp1 - tmp3, FIX(0.979689713)); /* c6 */
1394  z3 = MULTIPLY(tmp0 - tmp1, FIX(1.258538479)); /* c4 */
1395  dataptr[DCTSIZE*2] = (DCTELEM)
1396  DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.077210542)) /* c2+c8-c6 */
1397  - MULTIPLY(tmp4, FIX(1.471445400)), /* c4+c10 */
1398  CONST_BITS+2);
1399  dataptr[DCTSIZE*4] = (DCTELEM)
1400  DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.065941844)) /* c4-c6-c10 */
1401  - MULTIPLY(tmp2, FIX(1.435427942)) /* c2 */
1402  + MULTIPLY(tmp4, FIX(0.621472312)), /* c8 */
1403  CONST_BITS+2);
1404  dataptr[DCTSIZE*6] = (DCTELEM)
1405  DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.714276708)) /* c2+c4-c6 */
1406  - MULTIPLY(tmp2, FIX(0.834379234)), /* c8+c10 */
1407  CONST_BITS+2);
1408 
1409  /* Odd part */
1410 
1411  tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.360834544)); /* c3 */
1412  tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.130622199)); /* c5 */
1413  tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.808813568)); /* c7 */
1414  tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.819470145)) /* c7+c5+c3-c1 */
1415  + MULTIPLY(tmp14, FIX(0.421479672)); /* c9 */
1416  tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.808813568)); /* -c7 */
1417  tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.480800167)); /* -c1 */
1418  tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.350258864)) /* c9+c7+c1-c3 */
1419  - MULTIPLY(tmp14, FIX(1.130622199)); /* c5 */
1420  tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.421479672)); /* c9 */
1421  tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(2.104122847)) /* c9+c5+c3-c7 */
1422  + MULTIPLY(tmp14, FIX(1.480800167)); /* c1 */
1423  tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.381129125)) /* c1+c5-c9-c7 */
1424  - MULTIPLY(tmp14, FIX(1.360834544)); /* c3 */
1425 
1426  dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
1427  dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
1428  dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
1429  dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
1430 
1431  dataptr++; /* advance pointer to next column */
1432  wsptr++; /* advance pointer to next column */
1433  }
1434 }
1435 
1436 
1437 /*
1438  * Perform the forward DCT on a 12x12 sample block.
1439  */
1440 
1441 GLOBAL(void)
1442 jpeg_fdct_12x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1443 {
1444  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1445  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1446  DCTELEM workspace[8*4];
1447  DCTELEM *dataptr;
1448  DCTELEM *wsptr;
1449  JSAMPROW elemptr;
1450  int ctr;
1451  SHIFT_TEMPS
1452 
1453  /* Pass 1: process rows.
1454  * Note results are scaled up by sqrt(8) compared to a true DCT.
1455  * cK represents sqrt(2) * cos(K*pi/24).
1456  */
1457 
1458  dataptr = data;
1459  ctr = 0;
1460  for (;;) {
1461  elemptr = sample_data[ctr] + start_col;
1462 
1463  /* Even part */
1464 
1465  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
1466  tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
1467  tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
1468  tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
1469  tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
1470  tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
1471 
1472  tmp10 = tmp0 + tmp5;
1473  tmp13 = tmp0 - tmp5;
1474  tmp11 = tmp1 + tmp4;
1475  tmp14 = tmp1 - tmp4;
1476  tmp12 = tmp2 + tmp3;
1477  tmp15 = tmp2 - tmp3;
1478 
1479  tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
1480  tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
1481  tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
1482  tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
1483  tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
1484  tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
1485 
1486  /* Apply unsigned->signed conversion. */
1487  dataptr[0] = (DCTELEM) (tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE);
1488  dataptr[6] = (DCTELEM) (tmp13 - tmp14 - tmp15);
1489  dataptr[4] = (DCTELEM)
1490  DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
1491  CONST_BITS);
1492  dataptr[2] = (DCTELEM)
1493  DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
1494  CONST_BITS);
1495 
1496  /* Odd part */
1497 
1498  tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100); /* c9 */
1499  tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865); /* c3-c9 */
1500  tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065); /* c3+c9 */
1501  tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054)); /* c5 */
1502  tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669)); /* c7 */
1503  tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
1504  + MULTIPLY(tmp5, FIX(0.184591911)); /* c11 */
1505  tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
1506  tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
1507  + MULTIPLY(tmp5, FIX(0.860918669)); /* c7 */
1508  tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
1509  - MULTIPLY(tmp5, FIX(1.121971054)); /* c5 */
1510  tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
1511  - MULTIPLY(tmp2 + tmp5, FIX_0_541196100); /* c9 */
1512 
1513  dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS);
1514  dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS);
1515  dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS);
1516  dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS);
1517 
1518  ctr++;
1519 
1520  if (ctr != DCTSIZE) {
1521  if (ctr == 12)
1522  break; /* Done. */
1523  dataptr += DCTSIZE; /* advance pointer to next row */
1524  } else
1525  dataptr = workspace; /* switch pointer to extended workspace */
1526  }
1527 
1528  /* Pass 2: process columns.
1529  * We leave the results scaled up by an overall factor of 8.
1530  * We must also scale the output by (8/12)**2 = 4/9, which we partially
1531  * fold into the constant multipliers and final shifting:
1532  * cK now represents sqrt(2) * cos(K*pi/24) * 8/9.
1533  */
1534 
1535  dataptr = data;
1536  wsptr = workspace;
1537  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1538  /* Even part */
1539 
1540  tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
1541  tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
1542  tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
1543  tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
1544  tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
1545  tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];
1546 
1547  tmp10 = tmp0 + tmp5;
1548  tmp13 = tmp0 - tmp5;
1549  tmp11 = tmp1 + tmp4;
1550  tmp14 = tmp1 - tmp4;
1551  tmp12 = tmp2 + tmp3;
1552  tmp15 = tmp2 - tmp3;
1553 
1554  tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
1555  tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
1556  tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
1557  tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
1558  tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
1559  tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];
1560 
1561  dataptr[DCTSIZE*0] = (DCTELEM)
1562  DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
1563  CONST_BITS+1);
1564  dataptr[DCTSIZE*6] = (DCTELEM)
1565  DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
1566  CONST_BITS+1);
1567  dataptr[DCTSIZE*4] = (DCTELEM)
1568  DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)), /* c4 */
1569  CONST_BITS+1);
1570  dataptr[DCTSIZE*2] = (DCTELEM)
1571  DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) + /* 8/9 */
1572  MULTIPLY(tmp13 + tmp15, FIX(1.214244803)), /* c2 */
1573  CONST_BITS+1);
1574 
1575  /* Odd part */
1576 
1577  tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200)); /* c9 */
1578  tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102)); /* c3-c9 */
1579  tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502)); /* c3+c9 */
1580  tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603)); /* c5 */
1581  tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039)); /* c7 */
1582  tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
1583  + MULTIPLY(tmp5, FIX(0.164081699)); /* c11 */
1584  tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
1585  tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
1586  + MULTIPLY(tmp5, FIX(0.765261039)); /* c7 */
1587  tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
1588  - MULTIPLY(tmp5, FIX(0.997307603)); /* c5 */
1589  tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
1590  - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
1591 
1592  dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+1);
1593  dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+1);
1594  dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+1);
1595  dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+1);
1596 
1597  dataptr++; /* advance pointer to next column */
1598  wsptr++; /* advance pointer to next column */
1599  }
1600 }
1601 
1602 
1603 /*
1604  * Perform the forward DCT on a 13x13 sample block.
1605  */
1606 
1607 GLOBAL(void)
1608 jpeg_fdct_13x13 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1609 {
1610  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1611  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1612  INT32 z1, z2;
1613  DCTELEM workspace[8*5];
1614  DCTELEM *dataptr;
1615  DCTELEM *wsptr;
1616  JSAMPROW elemptr;
1617  int ctr;
1618  SHIFT_TEMPS
1619 
1620  /* Pass 1: process rows.
1621  * Note results are scaled up by sqrt(8) compared to a true DCT.
1622  * cK represents sqrt(2) * cos(K*pi/26).
1623  */
1624 
1625  dataptr = data;
1626  ctr = 0;
1627  for (;;) {
1628  elemptr = sample_data[ctr] + start_col;
1629 
1630  /* Even part */
1631 
1632  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[12]);
1633  tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[11]);
1634  tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[10]);
1635  tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[9]);
1636  tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[8]);
1637  tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[7]);
1638  tmp6 = GETJSAMPLE(elemptr[6]);
1639 
1640  tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[12]);
1641  tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[11]);
1642  tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[10]);
1643  tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[9]);
1644  tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[8]);
1645  tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[7]);
1646 
1647  /* Apply unsigned->signed conversion. */
1648  dataptr[0] = (DCTELEM)
1649  (tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6 - 13 * CENTERJSAMPLE);
1650  tmp6 += tmp6;
1651  tmp0 -= tmp6;
1652  tmp1 -= tmp6;
1653  tmp2 -= tmp6;
1654  tmp3 -= tmp6;
1655  tmp4 -= tmp6;
1656  tmp5 -= tmp6;
1657  dataptr[2] = (DCTELEM)
1658  DESCALE(MULTIPLY(tmp0, FIX(1.373119086)) + /* c2 */
1659  MULTIPLY(tmp1, FIX(1.058554052)) + /* c6 */
1660  MULTIPLY(tmp2, FIX(0.501487041)) - /* c10 */
1661  MULTIPLY(tmp3, FIX(0.170464608)) - /* c12 */
1662  MULTIPLY(tmp4, FIX(0.803364869)) - /* c8 */
1663  MULTIPLY(tmp5, FIX(1.252223920)), /* c4 */
1664  CONST_BITS);
1665  z1 = MULTIPLY(tmp0 - tmp2, FIX(1.155388986)) - /* (c4+c6)/2 */
1666  MULTIPLY(tmp3 - tmp4, FIX(0.435816023)) - /* (c2-c10)/2 */
1667  MULTIPLY(tmp1 - tmp5, FIX(0.316450131)); /* (c8-c12)/2 */
1668  z2 = MULTIPLY(tmp0 + tmp2, FIX(0.096834934)) - /* (c4-c6)/2 */
1669  MULTIPLY(tmp3 + tmp4, FIX(0.937303064)) + /* (c2+c10)/2 */
1670  MULTIPLY(tmp1 + tmp5, FIX(0.486914739)); /* (c8+c12)/2 */
1671 
1672  dataptr[4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS);
1673  dataptr[6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS);
1674 
1675  /* Odd part */
1676 
1677  tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.322312651)); /* c3 */
1678  tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.163874945)); /* c5 */
1679  tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.937797057)) + /* c7 */
1680  MULTIPLY(tmp14 + tmp15, FIX(0.338443458)); /* c11 */
1681  tmp0 = tmp1 + tmp2 + tmp3 -
1682  MULTIPLY(tmp10, FIX(2.020082300)) + /* c3+c5+c7-c1 */
1683  MULTIPLY(tmp14, FIX(0.318774355)); /* c9-c11 */
1684  tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.937797057)) - /* c7 */
1685  MULTIPLY(tmp11 + tmp12, FIX(0.338443458)); /* c11 */
1686  tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.163874945)); /* -c5 */
1687  tmp1 += tmp4 + tmp5 +
1688  MULTIPLY(tmp11, FIX(0.837223564)) - /* c5+c9+c11-c3 */
1689  MULTIPLY(tmp14, FIX(2.341699410)); /* c1+c7 */
1690  tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.657217813)); /* -c9 */
1691  tmp2 += tmp4 + tmp6 -
1692  MULTIPLY(tmp12, FIX(1.572116027)) + /* c1+c5-c9-c11 */
1693  MULTIPLY(tmp15, FIX(2.260109708)); /* c3+c7 */
1694  tmp3 += tmp5 + tmp6 +
1695  MULTIPLY(tmp13, FIX(2.205608352)) - /* c3+c5+c9-c7 */
1696  MULTIPLY(tmp15, FIX(1.742345811)); /* c1+c11 */
1697 
1698  dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
1699  dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
1700  dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
1701  dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
1702 
1703  ctr++;
1704 
1705  if (ctr != DCTSIZE) {
1706  if (ctr == 13)
1707  break; /* Done. */
1708  dataptr += DCTSIZE; /* advance pointer to next row */
1709  } else
1710  dataptr = workspace; /* switch pointer to extended workspace */
1711  }
1712 
1713  /* Pass 2: process columns.
1714  * We leave the results scaled up by an overall factor of 8.
1715  * We must also scale the output by (8/13)**2 = 64/169, which we partially
1716  * fold into the constant multipliers and final shifting:
1717  * cK now represents sqrt(2) * cos(K*pi/26) * 128/169.
1718  */
1719 
1720  dataptr = data;
1721  wsptr = workspace;
1722  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1723  /* Even part */
1724 
1725  tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*4];
1726  tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*3];
1727  tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*2];
1728  tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*1];
1729  tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*0];
1730  tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*7];
1731  tmp6 = dataptr[DCTSIZE*6];
1732 
1733  tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*4];
1734  tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*3];
1735  tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*2];
1736  tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*1];
1737  tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*0];
1738  tmp15 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*7];
1739 
1740  dataptr[DCTSIZE*0] = (DCTELEM)
1741  DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6,
1742  FIX(0.757396450)), /* 128/169 */
1743  CONST_BITS+1);
1744  tmp6 += tmp6;
1745  tmp0 -= tmp6;
1746  tmp1 -= tmp6;
1747  tmp2 -= tmp6;
1748  tmp3 -= tmp6;
1749  tmp4 -= tmp6;
1750  tmp5 -= tmp6;
1751  dataptr[DCTSIZE*2] = (DCTELEM)
1752  DESCALE(MULTIPLY(tmp0, FIX(1.039995521)) + /* c2 */
1753  MULTIPLY(tmp1, FIX(0.801745081)) + /* c6 */
1754  MULTIPLY(tmp2, FIX(0.379824504)) - /* c10 */
1755  MULTIPLY(tmp3, FIX(0.129109289)) - /* c12 */
1756  MULTIPLY(tmp4, FIX(0.608465700)) - /* c8 */
1757  MULTIPLY(tmp5, FIX(0.948429952)), /* c4 */
1758  CONST_BITS+1);
1759  z1 = MULTIPLY(tmp0 - tmp2, FIX(0.875087516)) - /* (c4+c6)/2 */
1760  MULTIPLY(tmp3 - tmp4, FIX(0.330085509)) - /* (c2-c10)/2 */
1761  MULTIPLY(tmp1 - tmp5, FIX(0.239678205)); /* (c8-c12)/2 */
1762  z2 = MULTIPLY(tmp0 + tmp2, FIX(0.073342435)) - /* (c4-c6)/2 */
1763  MULTIPLY(tmp3 + tmp4, FIX(0.709910013)) + /* (c2+c10)/2 */
1764  MULTIPLY(tmp1 + tmp5, FIX(0.368787494)); /* (c8+c12)/2 */
1765 
1766  dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+1);
1767  dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS+1);
1768 
1769  /* Odd part */
1770 
1771  tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.001514908)); /* c3 */
1772  tmp2 = MULTIPLY(tmp10 + tmp12, FIX(0.881514751)); /* c5 */
1773  tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.710284161)) + /* c7 */
1774  MULTIPLY(tmp14 + tmp15, FIX(0.256335874)); /* c11 */
1775  tmp0 = tmp1 + tmp2 + tmp3 -
1776  MULTIPLY(tmp10, FIX(1.530003162)) + /* c3+c5+c7-c1 */
1777  MULTIPLY(tmp14, FIX(0.241438564)); /* c9-c11 */
1778  tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.710284161)) - /* c7 */
1779  MULTIPLY(tmp11 + tmp12, FIX(0.256335874)); /* c11 */
1780  tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(0.881514751)); /* -c5 */
1781  tmp1 += tmp4 + tmp5 +
1782  MULTIPLY(tmp11, FIX(0.634110155)) - /* c5+c9+c11-c3 */
1783  MULTIPLY(tmp14, FIX(1.773594819)); /* c1+c7 */
1784  tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.497774438)); /* -c9 */
1785  tmp2 += tmp4 + tmp6 -
1786  MULTIPLY(tmp12, FIX(1.190715098)) + /* c1+c5-c9-c11 */
1787  MULTIPLY(tmp15, FIX(1.711799069)); /* c3+c7 */
1788  tmp3 += tmp5 + tmp6 +
1789  MULTIPLY(tmp13, FIX(1.670519935)) - /* c3+c5+c9-c7 */
1790  MULTIPLY(tmp15, FIX(1.319646532)); /* c1+c11 */
1791 
1792  dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+1);
1793  dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+1);
1794  dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+1);
1795  dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+1);
1796 
1797  dataptr++; /* advance pointer to next column */
1798  wsptr++; /* advance pointer to next column */
1799  }
1800 }
1801 
1802 
1803 /*
1804  * Perform the forward DCT on a 14x14 sample block.
1805  */
1806 
1807 GLOBAL(void)
1808 jpeg_fdct_14x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1809 {
1810  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1811  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
1812  DCTELEM workspace[8*6];
1813  DCTELEM *dataptr;
1814  DCTELEM *wsptr;
1815  JSAMPROW elemptr;
1816  int ctr;
1817  SHIFT_TEMPS
1818 
1819  /* Pass 1: process rows.
1820  * Note results are scaled up by sqrt(8) compared to a true DCT.
1821  * cK represents sqrt(2) * cos(K*pi/28).
1822  */
1823 
1824  dataptr = data;
1825  ctr = 0;
1826  for (;;) {
1827  elemptr = sample_data[ctr] + start_col;
1828 
1829  /* Even part */
1830 
1831  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
1832  tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
1833  tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
1834  tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
1835  tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
1836  tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
1837  tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
1838 
1839  tmp10 = tmp0 + tmp6;
1840  tmp14 = tmp0 - tmp6;
1841  tmp11 = tmp1 + tmp5;
1842  tmp15 = tmp1 - tmp5;
1843  tmp12 = tmp2 + tmp4;
1844  tmp16 = tmp2 - tmp4;
1845 
1846  tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
1847  tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
1848  tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
1849  tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
1850  tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
1851  tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
1852  tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
1853 
1854  /* Apply unsigned->signed conversion. */
1855  dataptr[0] = (DCTELEM)
1856  (tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE);
1857  tmp13 += tmp13;
1858  dataptr[4] = (DCTELEM)
1859  DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
1860  MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
1861  MULTIPLY(tmp12 - tmp13, FIX(0.881747734)), /* c8 */
1862  CONST_BITS);
1863 
1864  tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686)); /* c6 */
1865 
1866  dataptr[2] = (DCTELEM)
1867  DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590)) /* c2-c6 */
1868  + MULTIPLY(tmp16, FIX(0.613604268)), /* c10 */
1869  CONST_BITS);
1870  dataptr[6] = (DCTELEM)
1871  DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954)) /* c6+c10 */
1872  - MULTIPLY(tmp16, FIX(1.378756276)), /* c2 */
1873  CONST_BITS);
1874 
1875  /* Odd part */
1876 
1877  tmp10 = tmp1 + tmp2;
1878  tmp11 = tmp5 - tmp4;
1879  dataptr[7] = (DCTELEM) (tmp0 - tmp10 + tmp3 - tmp11 - tmp6);
1880  tmp3 <<= CONST_BITS;
1881  tmp10 = MULTIPLY(tmp10, - FIX(0.158341681)); /* -c13 */
1882  tmp11 = MULTIPLY(tmp11, FIX(1.405321284)); /* c1 */
1883  tmp10 += tmp11 - tmp3;
1884  tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) + /* c5 */
1885  MULTIPLY(tmp4 + tmp6, FIX(0.752406978)); /* c9 */
1886  dataptr[5] = (DCTELEM)
1887  DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
1888  + MULTIPLY(tmp4, FIX(1.119999435)), /* c1+c11-c9 */
1889  CONST_BITS);
1890  tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) + /* c3 */
1891  MULTIPLY(tmp5 - tmp6, FIX(0.467085129)); /* c11 */
1892  dataptr[3] = (DCTELEM)
1893  DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
1894  - MULTIPLY(tmp5, FIX(3.069855259)), /* c1+c5+c11 */
1895  CONST_BITS);
1896  dataptr[1] = (DCTELEM)
1897  DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
1898  MULTIPLY(tmp0 + tmp6, FIX(1.126980169)), /* c3+c5-c1 */
1899  CONST_BITS);
1900 
1901  ctr++;
1902 
1903  if (ctr != DCTSIZE) {
1904  if (ctr == 14)
1905  break; /* Done. */
1906  dataptr += DCTSIZE; /* advance pointer to next row */
1907  } else
1908  dataptr = workspace; /* switch pointer to extended workspace */
1909  }
1910 
1911  /* Pass 2: process columns.
1912  * We leave the results scaled up by an overall factor of 8.
1913  * We must also scale the output by (8/14)**2 = 16/49, which we partially
1914  * fold into the constant multipliers and final shifting:
1915  * cK now represents sqrt(2) * cos(K*pi/28) * 32/49.
1916  */
1917 
1918  dataptr = data;
1919  wsptr = workspace;
1920  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1921  /* Even part */
1922 
1923  tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
1924  tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
1925  tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
1926  tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
1927  tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
1928  tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
1929  tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
1930 
1931  tmp10 = tmp0 + tmp6;
1932  tmp14 = tmp0 - tmp6;
1933  tmp11 = tmp1 + tmp5;
1934  tmp15 = tmp1 - tmp5;
1935  tmp12 = tmp2 + tmp4;
1936  tmp16 = tmp2 - tmp4;
1937 
1938  tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
1939  tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
1940  tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
1941  tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
1942  tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
1943  tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
1944  tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];
1945 
1946  dataptr[DCTSIZE*0] = (DCTELEM)
1947  DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
1948  FIX(0.653061224)), /* 32/49 */
1949  CONST_BITS+1);
1950  tmp13 += tmp13;
1951  dataptr[DCTSIZE*4] = (DCTELEM)
1952  DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
1953  MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
1954  MULTIPLY(tmp12 - tmp13, FIX(0.575835255)), /* c8 */
1955  CONST_BITS+1);
1956 
1957  tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570)); /* c6 */
1958 
1959  dataptr[DCTSIZE*2] = (DCTELEM)
1960  DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691)) /* c2-c6 */
1961  + MULTIPLY(tmp16, FIX(0.400721155)), /* c10 */
1962  CONST_BITS+1);
1963  dataptr[DCTSIZE*6] = (DCTELEM)
1964  DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725)) /* c6+c10 */
1965  - MULTIPLY(tmp16, FIX(0.900412262)), /* c2 */
1966  CONST_BITS+1);
1967 
1968  /* Odd part */
1969 
1970  tmp10 = tmp1 + tmp2;
1971  tmp11 = tmp5 - tmp4;
1972  dataptr[DCTSIZE*7] = (DCTELEM)
1973  DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
1974  FIX(0.653061224)), /* 32/49 */
1975  CONST_BITS+1);
1976  tmp3 = MULTIPLY(tmp3 , FIX(0.653061224)); /* 32/49 */
1977  tmp10 = MULTIPLY(tmp10, - FIX(0.103406812)); /* -c13 */
1978  tmp11 = MULTIPLY(tmp11, FIX(0.917760839)); /* c1 */
1979  tmp10 += tmp11 - tmp3;
1980  tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) + /* c5 */
1981  MULTIPLY(tmp4 + tmp6, FIX(0.491367823)); /* c9 */
1982  dataptr[DCTSIZE*5] = (DCTELEM)
1983  DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
1984  + MULTIPLY(tmp4, FIX(0.731428202)), /* c1+c11-c9 */
1985  CONST_BITS+1);
1986  tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) + /* c3 */
1987  MULTIPLY(tmp5 - tmp6, FIX(0.305035186)); /* c11 */
1988  dataptr[DCTSIZE*3] = (DCTELEM)
1989  DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
1990  - MULTIPLY(tmp5, FIX(2.004803435)), /* c1+c5+c11 */
1991  CONST_BITS+1);
1992  dataptr[DCTSIZE*1] = (DCTELEM)
1993  DESCALE(tmp11 + tmp12 + tmp3
1994  - MULTIPLY(tmp0, FIX(0.735987049)) /* c3+c5-c1 */
1995  - MULTIPLY(tmp6, FIX(0.082925825)), /* c9-c11-c13 */
1996  CONST_BITS+1);
1997 
1998  dataptr++; /* advance pointer to next column */
1999  wsptr++; /* advance pointer to next column */
2000  }
2001 }
2002 
2003 
2004 /*
2005  * Perform the forward DCT on a 15x15 sample block.
2006  */
2007 
2008 GLOBAL(void)
2009 jpeg_fdct_15x15 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2010 {
2011  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2012  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2013  INT32 z1, z2, z3;
2014  DCTELEM workspace[8*7];
2015  DCTELEM *dataptr;
2016  DCTELEM *wsptr;
2017  JSAMPROW elemptr;
2018  int ctr;
2019  SHIFT_TEMPS
2020 
2021  /* Pass 1: process rows.
2022  * Note results are scaled up by sqrt(8) compared to a true DCT.
2023  * cK represents sqrt(2) * cos(K*pi/30).
2024  */
2025 
2026  dataptr = data;
2027  ctr = 0;
2028  for (;;) {
2029  elemptr = sample_data[ctr] + start_col;
2030 
2031  /* Even part */
2032 
2033  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[14]);
2034  tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[13]);
2035  tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[12]);
2036  tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[11]);
2037  tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[10]);
2038  tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[9]);
2039  tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[8]);
2040  tmp7 = GETJSAMPLE(elemptr[7]);
2041 
2042  tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[14]);
2043  tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[13]);
2044  tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[12]);
2045  tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[11]);
2046  tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[10]);
2047  tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[9]);
2048  tmp16 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[8]);
2049 
2050  z1 = tmp0 + tmp4 + tmp5;
2051  z2 = tmp1 + tmp3 + tmp6;
2052  z3 = tmp2 + tmp7;
2053  /* Apply unsigned->signed conversion. */
2054  dataptr[0] = (DCTELEM) (z1 + z2 + z3 - 15 * CENTERJSAMPLE);
2055  z3 += z3;
2056  dataptr[6] = (DCTELEM)
2057  DESCALE(MULTIPLY(z1 - z3, FIX(1.144122806)) - /* c6 */
2058  MULTIPLY(z2 - z3, FIX(0.437016024)), /* c12 */
2059  CONST_BITS);
2060  tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
2061  z1 = MULTIPLY(tmp3 - tmp2, FIX(1.531135173)) - /* c2+c14 */
2062  MULTIPLY(tmp6 - tmp2, FIX(2.238241955)); /* c4+c8 */
2063  z2 = MULTIPLY(tmp5 - tmp2, FIX(0.798468008)) - /* c8-c14 */
2064  MULTIPLY(tmp0 - tmp2, FIX(0.091361227)); /* c2-c4 */
2065  z3 = MULTIPLY(tmp0 - tmp3, FIX(1.383309603)) + /* c2 */
2066  MULTIPLY(tmp6 - tmp5, FIX(0.946293579)) + /* c8 */
2067  MULTIPLY(tmp1 - tmp4, FIX(0.790569415)); /* (c6+c12)/2 */
2068 
2069  dataptr[2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS);
2070  dataptr[4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS);
2071 
2072  /* Odd part */
2073 
2074  tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
2075  FIX(1.224744871)); /* c5 */
2076  tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.344997024)) + /* c3 */
2077  MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.831253876)); /* c9 */
2078  tmp12 = MULTIPLY(tmp12, FIX(1.224744871)); /* c5 */
2079  tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.406466353)) + /* c1 */
2080  MULTIPLY(tmp11 + tmp14, FIX(1.344997024)) + /* c3 */
2081  MULTIPLY(tmp13 + tmp15, FIX(0.575212477)); /* c11 */
2082  tmp0 = MULTIPLY(tmp13, FIX(0.475753014)) - /* c7-c11 */
2083  MULTIPLY(tmp14, FIX(0.513743148)) + /* c3-c9 */
2084  MULTIPLY(tmp16, FIX(1.700497885)) + tmp4 + tmp12; /* c1+c13 */
2085  tmp3 = MULTIPLY(tmp10, - FIX(0.355500862)) - /* -(c1-c7) */
2086  MULTIPLY(tmp11, FIX(2.176250899)) - /* c3+c9 */
2087  MULTIPLY(tmp15, FIX(0.869244010)) + tmp4 - tmp12; /* c11+c13 */
2088 
2089  dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
2090  dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
2091  dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
2092  dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
2093 
2094  ctr++;
2095 
2096  if (ctr != DCTSIZE) {
2097  if (ctr == 15)
2098  break; /* Done. */
2099  dataptr += DCTSIZE; /* advance pointer to next row */
2100  } else
2101  dataptr = workspace; /* switch pointer to extended workspace */
2102  }
2103 
2104  /* Pass 2: process columns.
2105  * We leave the results scaled up by an overall factor of 8.
2106  * We must also scale the output by (8/15)**2 = 64/225, which we partially
2107  * fold into the constant multipliers and final shifting:
2108  * cK now represents sqrt(2) * cos(K*pi/30) * 256/225.
2109  */
2110 
2111  dataptr = data;
2112  wsptr = workspace;
2113  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2114  /* Even part */
2115 
2116  tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*6];
2117  tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*5];
2118  tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*4];
2119  tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*3];
2120  tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*2];
2121  tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*1];
2122  tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*0];
2123  tmp7 = dataptr[DCTSIZE*7];
2124 
2125  tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*6];
2126  tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*5];
2127  tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*4];
2128  tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*3];
2129  tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*2];
2130  tmp15 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*1];
2131  tmp16 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*0];
2132 
2133  z1 = tmp0 + tmp4 + tmp5;
2134  z2 = tmp1 + tmp3 + tmp6;
2135  z3 = tmp2 + tmp7;
2136  dataptr[DCTSIZE*0] = (DCTELEM)
2137  DESCALE(MULTIPLY(z1 + z2 + z3, FIX(1.137777778)), /* 256/225 */
2138  CONST_BITS+2);
2139  z3 += z3;
2140  dataptr[DCTSIZE*6] = (DCTELEM)
2141  DESCALE(MULTIPLY(z1 - z3, FIX(1.301757503)) - /* c6 */
2142  MULTIPLY(z2 - z3, FIX(0.497227121)), /* c12 */
2143  CONST_BITS+2);
2144  tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
2145  z1 = MULTIPLY(tmp3 - tmp2, FIX(1.742091575)) - /* c2+c14 */
2146  MULTIPLY(tmp6 - tmp2, FIX(2.546621957)); /* c4+c8 */
2147  z2 = MULTIPLY(tmp5 - tmp2, FIX(0.908479156)) - /* c8-c14 */
2148  MULTIPLY(tmp0 - tmp2, FIX(0.103948774)); /* c2-c4 */
2149  z3 = MULTIPLY(tmp0 - tmp3, FIX(1.573898926)) + /* c2 */
2150  MULTIPLY(tmp6 - tmp5, FIX(1.076671805)) + /* c8 */
2151  MULTIPLY(tmp1 - tmp4, FIX(0.899492312)); /* (c6+c12)/2 */
2152 
2153  dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS+2);
2154  dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS+2);
2155 
2156  /* Odd part */
2157 
2158  tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
2159  FIX(1.393487498)); /* c5 */
2160  tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.530307725)) + /* c3 */
2161  MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.945782187)); /* c9 */
2162  tmp12 = MULTIPLY(tmp12, FIX(1.393487498)); /* c5 */
2163  tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.600246161)) + /* c1 */
2164  MULTIPLY(tmp11 + tmp14, FIX(1.530307725)) + /* c3 */
2165  MULTIPLY(tmp13 + tmp15, FIX(0.654463974)); /* c11 */
2166  tmp0 = MULTIPLY(tmp13, FIX(0.541301207)) - /* c7-c11 */
2167  MULTIPLY(tmp14, FIX(0.584525538)) + /* c3-c9 */
2168  MULTIPLY(tmp16, FIX(1.934788705)) + tmp4 + tmp12; /* c1+c13 */
2169  tmp3 = MULTIPLY(tmp10, - FIX(0.404480980)) - /* -(c1-c7) */
2170  MULTIPLY(tmp11, FIX(2.476089912)) - /* c3+c9 */
2171  MULTIPLY(tmp15, FIX(0.989006518)) + tmp4 - tmp12; /* c11+c13 */
2172 
2173  dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
2174  dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
2175  dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
2176  dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
2177 
2178  dataptr++; /* advance pointer to next column */
2179  wsptr++; /* advance pointer to next column */
2180  }
2181 }
2182 
2183 
2184 /*
2185  * Perform the forward DCT on a 16x16 sample block.
2186  */
2187 
2188 GLOBAL(void)
2189 jpeg_fdct_16x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2190 {
2191  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2192  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
2193  DCTELEM workspace[DCTSIZE2];
2194  DCTELEM *dataptr;
2195  DCTELEM *wsptr;
2196  JSAMPROW elemptr;
2197  int ctr;
2198  SHIFT_TEMPS
2199 
2200  /* Pass 1: process rows.
2201  * Note results are scaled up by sqrt(8) compared to a true DCT;
2202  * furthermore, we scale the results by 2**PASS1_BITS.
2203  * cK represents sqrt(2) * cos(K*pi/32).
2204  */
2205 
2206  dataptr = data;
2207  ctr = 0;
2208  for (;;) {
2209  elemptr = sample_data[ctr] + start_col;
2210 
2211  /* Even part */
2212 
2213  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
2214  tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
2215  tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
2216  tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
2217  tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
2218  tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
2219  tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
2220  tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
2221 
2222  tmp10 = tmp0 + tmp7;
2223  tmp14 = tmp0 - tmp7;
2224  tmp11 = tmp1 + tmp6;
2225  tmp15 = tmp1 - tmp6;
2226  tmp12 = tmp2 + tmp5;
2227  tmp16 = tmp2 - tmp5;
2228  tmp13 = tmp3 + tmp4;
2229  tmp17 = tmp3 - tmp4;
2230 
2231  tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
2232  tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
2233  tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
2234  tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
2235  tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
2236  tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
2237  tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
2238  tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
2239 
2240  /* Apply unsigned->signed conversion. */
2241  dataptr[0] = (DCTELEM)
2242  ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
2243  dataptr[4] = (DCTELEM)
2244  DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2245  MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
2246  CONST_BITS-PASS1_BITS);
2247 
2248  tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
2249  MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
2250 
2251  dataptr[2] = (DCTELEM)
2252  DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
2253  + MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
2254  CONST_BITS-PASS1_BITS);
2255  dataptr[6] = (DCTELEM)
2256  DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
2257  - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
2258  CONST_BITS-PASS1_BITS);
2259 
2260  /* Odd part */
2261 
2262  tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
2263  MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
2264  tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
2265  MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
2266  tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
2267  MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
2268  tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
2269  MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
2270  tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
2271  MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
2272  tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
2273  MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
2274  tmp10 = tmp11 + tmp12 + tmp13 -
2275  MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
2276  MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
2277  tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
2278  - MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
2279  tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
2280  + MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
2281  tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
2282  + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
2283 
2284  dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2285  dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2286  dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2287  dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2288 
2289  ctr++;
2290 
2291  if (ctr != DCTSIZE) {
2292  if (ctr == DCTSIZE * 2)
2293  break; /* Done. */
2294  dataptr += DCTSIZE; /* advance pointer to next row */
2295  } else
2296  dataptr = workspace; /* switch pointer to extended workspace */
2297  }
2298 
2299  /* Pass 2: process columns.
2300  * We remove the PASS1_BITS scaling, but leave the results scaled up
2301  * by an overall factor of 8.
2302  * We must also scale the output by (8/16)**2 = 1/2**2.
2303  * cK represents sqrt(2) * cos(K*pi/32).
2304  */
2305 
2306  dataptr = data;
2307  wsptr = workspace;
2308  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2309  /* Even part */
2310 
2311  tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
2312  tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
2313  tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
2314  tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
2315  tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
2316  tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
2317  tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
2318  tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
2319 
2320  tmp10 = tmp0 + tmp7;
2321  tmp14 = tmp0 - tmp7;
2322  tmp11 = tmp1 + tmp6;
2323  tmp15 = tmp1 - tmp6;
2324  tmp12 = tmp2 + tmp5;
2325  tmp16 = tmp2 - tmp5;
2326  tmp13 = tmp3 + tmp4;
2327  tmp17 = tmp3 - tmp4;
2328 
2329  tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
2330  tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
2331  tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
2332  tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
2333  tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
2334  tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
2335  tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
2336  tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];
2337 
2338  dataptr[DCTSIZE*0] = (DCTELEM)
2339  DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+2);
2340  dataptr[DCTSIZE*4] = (DCTELEM)
2341  DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2342  MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
2343  CONST_BITS+PASS1_BITS+2);
2344 
2345  tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
2346  MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
2347 
2348  dataptr[DCTSIZE*2] = (DCTELEM)
2349  DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
2350  + MULTIPLY(tmp16, FIX(2.172734804)), /* c2+10 */
2351  CONST_BITS+PASS1_BITS+2);
2352  dataptr[DCTSIZE*6] = (DCTELEM)
2353  DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
2354  - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
2355  CONST_BITS+PASS1_BITS+2);
2356 
2357  /* Odd part */
2358 
2359  tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
2360  MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
2361  tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
2362  MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
2363  tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
2364  MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
2365  tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
2366  MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
2367  tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
2368  MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
2369  tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
2370  MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
2371  tmp10 = tmp11 + tmp12 + tmp13 -
2372  MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
2373  MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
2374  tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
2375  - MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
2376  tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
2377  + MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
2378  tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
2379  + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
2380 
2381  dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+2);
2382  dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+2);
2383  dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+2);
2384  dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+2);
2385 
2386  dataptr++; /* advance pointer to next column */
2387  wsptr++; /* advance pointer to next column */
2388  }
2389 }
2390 
2391 
2392 /*
2393  * Perform the forward DCT on a 16x8 sample block.
2394  *
2395  * 16-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
2396  */
2397 
2398 GLOBAL(void)
2400 {
2401  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2402  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
2403  INT32 z1;
2404  DCTELEM *dataptr;
2405  JSAMPROW elemptr;
2406  int ctr;
2407  SHIFT_TEMPS
2408 
2409  /* Pass 1: process rows.
2410  * Note results are scaled up by sqrt(8) compared to a true DCT;
2411  * furthermore, we scale the results by 2**PASS1_BITS.
2412  * 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
2413  */
2414 
2415  dataptr = data;
2416  ctr = 0;
2417  for (ctr = 0; ctr < DCTSIZE; ctr++) {
2418  elemptr = sample_data[ctr] + start_col;
2419 
2420  /* Even part */
2421 
2422  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
2423  tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
2424  tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
2425  tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
2426  tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
2427  tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
2428  tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
2429  tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
2430 
2431  tmp10 = tmp0 + tmp7;
2432  tmp14 = tmp0 - tmp7;
2433  tmp11 = tmp1 + tmp6;
2434  tmp15 = tmp1 - tmp6;
2435  tmp12 = tmp2 + tmp5;
2436  tmp16 = tmp2 - tmp5;
2437  tmp13 = tmp3 + tmp4;
2438  tmp17 = tmp3 - tmp4;
2439 
2440  tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
2441  tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
2442  tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
2443  tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
2444  tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
2445  tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
2446  tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
2447  tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
2448 
2449  /* Apply unsigned->signed conversion. */
2450  dataptr[0] = (DCTELEM)
2451  ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
2452  dataptr[4] = (DCTELEM)
2453  DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2454  MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
2455  CONST_BITS-PASS1_BITS);
2456 
2457  tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
2458  MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
2459 
2460  dataptr[2] = (DCTELEM)
2461  DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
2462  + MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
2463  CONST_BITS-PASS1_BITS);
2464  dataptr[6] = (DCTELEM)
2465  DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
2466  - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
2467  CONST_BITS-PASS1_BITS);
2468 
2469  /* Odd part */
2470 
2471  tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
2472  MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
2473  tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
2474  MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
2475  tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
2476  MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
2477  tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
2478  MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
2479  tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
2480  MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
2481  tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
2482  MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
2483  tmp10 = tmp11 + tmp12 + tmp13 -
2484  MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
2485  MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
2486  tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
2487  - MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
2488  tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
2489  + MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
2490  tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
2491  + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
2492 
2493  dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2494  dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2495  dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2496  dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2497 
2498  dataptr += DCTSIZE; /* advance pointer to next row */
2499  }
2500 
2501  /* Pass 2: process columns.
2502  * We remove the PASS1_BITS scaling, but leave the results scaled up
2503  * by an overall factor of 8.
2504  * We must also scale the output by 8/16 = 1/2.
2505  * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
2506  */
2507 
2508  dataptr = data;
2509  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2510  /* Even part per LL&M figure 1 --- note that published figure is faulty;
2511  * rotator "c1" should be "c6".
2512  */
2513 
2514  tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
2515  tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
2516  tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
2517  tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
2518 
2519  tmp10 = tmp0 + tmp3;
2520  tmp12 = tmp0 - tmp3;
2521  tmp11 = tmp1 + tmp2;
2522  tmp13 = tmp1 - tmp2;
2523 
2524  tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
2525  tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
2526  tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
2527  tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
2528 
2529  dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS+1);
2530  dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS+1);
2531 
2532  z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); /* c6 */
2533  dataptr[DCTSIZE*2] = (DCTELEM)
2534  DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
2535  CONST_BITS+PASS1_BITS+1);
2536  dataptr[DCTSIZE*6] = (DCTELEM)
2537  DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
2538  CONST_BITS+PASS1_BITS+1);
2539 
2540  /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
2541  * i0..i3 in the paper are tmp0..tmp3 here.
2542  */
2543 
2544  tmp12 = tmp0 + tmp2;
2545  tmp13 = tmp1 + tmp3;
2546 
2547  z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
2548  tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* -c3+c5 */
2549  tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
2550  tmp12 += z1;
2551  tmp13 += z1;
2552 
2553  z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
2554  tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
2555  tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
2556  tmp0 += z1 + tmp12;
2557  tmp3 += z1 + tmp13;
2558 
2559  z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
2560  tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
2561  tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
2562  tmp1 += z1 + tmp13;
2563  tmp2 += z1 + tmp12;
2564 
2565  dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS+1);
2566  dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS+1);
2567  dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS+1);
2568  dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+PASS1_BITS+1);
2569 
2570  dataptr++; /* advance pointer to next column */
2571  }
2572 }
2573 
2574 
2575 /*
2576  * Perform the forward DCT on a 14x7 sample block.
2577  *
2578  * 14-point FDCT in pass 1 (rows), 7-point in pass 2 (columns).
2579  */
2580 
2581 GLOBAL(void)
2583 {
2584  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
2585  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2586  INT32 z1, z2, z3;
2587  DCTELEM *dataptr;
2588  JSAMPROW elemptr;
2589  int ctr;
2590  SHIFT_TEMPS
2591 
2592  /* Zero bottom row of output coefficient block. */
2594 
2595  /* Pass 1: process rows.
2596  * Note results are scaled up by sqrt(8) compared to a true DCT;
2597  * furthermore, we scale the results by 2**PASS1_BITS.
2598  * 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
2599  */
2600 
2601  dataptr = data;
2602  for (ctr = 0; ctr < 7; ctr++) {
2603  elemptr = sample_data[ctr] + start_col;
2604 
2605  /* Even part */
2606 
2607  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
2608  tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
2609  tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
2610  tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
2611  tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
2612  tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
2613  tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
2614 
2615  tmp10 = tmp0 + tmp6;
2616  tmp14 = tmp0 - tmp6;
2617  tmp11 = tmp1 + tmp5;
2618  tmp15 = tmp1 - tmp5;
2619  tmp12 = tmp2 + tmp4;
2620  tmp16 = tmp2 - tmp4;
2621 
2622  tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
2623  tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
2624  tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
2625  tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
2626  tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
2627  tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
2628  tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
2629 
2630  /* Apply unsigned->signed conversion. */
2631  dataptr[0] = (DCTELEM)
2632  ((tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE) << PASS1_BITS);
2633  tmp13 += tmp13;
2634  dataptr[4] = (DCTELEM)
2635  DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
2636  MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
2637  MULTIPLY(tmp12 - tmp13, FIX(0.881747734)), /* c8 */
2638  CONST_BITS-PASS1_BITS);
2639 
2640  tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686)); /* c6 */
2641 
2642  dataptr[2] = (DCTELEM)
2643  DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590)) /* c2-c6 */
2644  + MULTIPLY(tmp16, FIX(0.613604268)), /* c10 */
2645  CONST_BITS-PASS1_BITS);
2646  dataptr[6] = (DCTELEM)
2647  DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954)) /* c6+c10 */
2648  - MULTIPLY(tmp16, FIX(1.378756276)), /* c2 */
2649  CONST_BITS-PASS1_BITS);
2650 
2651  /* Odd part */
2652 
2653  tmp10 = tmp1 + tmp2;
2654  tmp11 = tmp5 - tmp4;
2655  dataptr[7] = (DCTELEM) ((tmp0 - tmp10 + tmp3 - tmp11 - tmp6) << PASS1_BITS);
2656  tmp3 <<= CONST_BITS;
2657  tmp10 = MULTIPLY(tmp10, - FIX(0.158341681)); /* -c13 */
2658  tmp11 = MULTIPLY(tmp11, FIX(1.405321284)); /* c1 */
2659  tmp10 += tmp11 - tmp3;
2660  tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) + /* c5 */
2661  MULTIPLY(tmp4 + tmp6, FIX(0.752406978)); /* c9 */
2662  dataptr[5] = (DCTELEM)
2663  DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
2664  + MULTIPLY(tmp4, FIX(1.119999435)), /* c1+c11-c9 */
2665  CONST_BITS-PASS1_BITS);
2666  tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) + /* c3 */
2667  MULTIPLY(tmp5 - tmp6, FIX(0.467085129)); /* c11 */
2668  dataptr[3] = (DCTELEM)
2669  DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
2670  - MULTIPLY(tmp5, FIX(3.069855259)), /* c1+c5+c11 */
2671  CONST_BITS-PASS1_BITS);
2672  dataptr[1] = (DCTELEM)
2673  DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
2674  MULTIPLY(tmp0 + tmp6, FIX(1.126980169)), /* c3+c5-c1 */
2675  CONST_BITS-PASS1_BITS);
2676 
2677  dataptr += DCTSIZE; /* advance pointer to next row */
2678  }
2679 
2680  /* Pass 2: process columns.
2681  * We remove the PASS1_BITS scaling, but leave the results scaled up
2682  * by an overall factor of 8.
2683  * We must also scale the output by (8/14)*(8/7) = 32/49, which we
2684  * partially fold into the constant multipliers and final shifting:
2685  * 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14) * 64/49.
2686  */
2687 
2688  dataptr = data;
2689  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2690  /* Even part */
2691 
2692  tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
2693  tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
2694  tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
2695  tmp3 = dataptr[DCTSIZE*3];
2696 
2697  tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
2698  tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
2699  tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];
2700 
2701  z1 = tmp0 + tmp2;
2702  dataptr[DCTSIZE*0] = (DCTELEM)
2703  DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
2704  CONST_BITS+PASS1_BITS+1);
2705  tmp3 += tmp3;
2706  z1 -= tmp3;
2707  z1 -= tmp3;
2708  z1 = MULTIPLY(z1, FIX(0.461784020)); /* (c2+c6-c4)/2 */
2709  z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084)); /* (c2+c4-c6)/2 */
2710  z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446)); /* c6 */
2711  dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS+1);
2712  z1 -= z2;
2713  z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509)); /* c4 */
2714  dataptr[DCTSIZE*4] = (DCTELEM)
2715  DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
2716  CONST_BITS+PASS1_BITS+1);
2717  dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS+1);
2718 
2719  /* Odd part */
2720 
2721  tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677)); /* (c3+c1-c5)/2 */
2722  tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464)); /* (c3+c5-c1)/2 */
2723  tmp0 = tmp1 - tmp2;
2724  tmp1 += tmp2;
2725  tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
2726  tmp1 += tmp2;
2727  tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310)); /* c5 */
2728  tmp0 += tmp3;
2729  tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355)); /* c3+c1-c5 */
2730 
2731  dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS+1);
2732  dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS+1);
2733  dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS+1);
2734 
2735  dataptr++; /* advance pointer to next column */
2736  }
2737 }
2738 
2739 
2740 /*
2741  * Perform the forward DCT on a 12x6 sample block.
2742  *
2743  * 12-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
2744  */
2745 
2746 GLOBAL(void)
2748 {
2749  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
2750  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2751  DCTELEM *dataptr;
2752  JSAMPROW elemptr;
2753  int ctr;
2754  SHIFT_TEMPS
2755 
2756  /* Zero 2 bottom rows of output coefficient block. */
2757  MEMZERO(&data[DCTSIZE*6], SIZEOF(DCTELEM) * DCTSIZE * 2);
2758 
2759  /* Pass 1: process rows.
2760  * Note results are scaled up by sqrt(8) compared to a true DCT;
2761  * furthermore, we scale the results by 2**PASS1_BITS.
2762  * 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
2763  */
2764 
2765  dataptr = data;
2766  for (ctr = 0; ctr < 6; ctr++) {
2767  elemptr = sample_data[ctr] + start_col;
2768 
2769  /* Even part */
2770 
2771  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
2772  tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
2773  tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
2774  tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
2775  tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
2776  tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
2777 
2778  tmp10 = tmp0 + tmp5;
2779  tmp13 = tmp0 - tmp5;
2780  tmp11 = tmp1 + tmp4;
2781  tmp14 = tmp1 - tmp4;
2782  tmp12 = tmp2 + tmp3;
2783  tmp15 = tmp2 - tmp3;
2784 
2785  tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
2786  tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
2787  tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
2788  tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
2789  tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
2790  tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
2791 
2792  /* Apply unsigned->signed conversion. */
2793  dataptr[0] = (DCTELEM)
2794  ((tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE) << PASS1_BITS);
2795  dataptr[6] = (DCTELEM) ((tmp13 - tmp14 - tmp15) << PASS1_BITS);
2796  dataptr[4] = (DCTELEM)
2797  DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
2798  CONST_BITS-PASS1_BITS);
2799  dataptr[2] = (DCTELEM)
2800  DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
2801  CONST_BITS-PASS1_BITS);
2802 
2803  /* Odd part */
2804 
2805  tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100); /* c9 */
2806  tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865); /* c3-c9 */
2807  tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065); /* c3+c9 */
2808  tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054)); /* c5 */
2809  tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669)); /* c7 */
2810  tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
2811  + MULTIPLY(tmp5, FIX(0.184591911)); /* c11 */
2812  tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
2813  tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
2814  + MULTIPLY(tmp5, FIX(0.860918669)); /* c7 */
2815  tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
2816  - MULTIPLY(tmp5, FIX(1.121971054)); /* c5 */
2817  tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
2818  - MULTIPLY(tmp2 + tmp5, FIX_0_541196100); /* c9 */
2819 
2820  dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2821  dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2822  dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2823  dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2824 
2825  dataptr += DCTSIZE; /* advance pointer to next row */
2826  }
2827 
2828  /* Pass 2: process columns.
2829  * We remove the PASS1_BITS scaling, but leave the results scaled up
2830  * by an overall factor of 8.
2831  * We must also scale the output by (8/12)*(8/6) = 8/9, which we
2832  * partially fold into the constant multipliers and final shifting:
2833  * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
2834  */
2835 
2836  dataptr = data;
2837  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2838  /* Even part */
2839 
2840  tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
2841  tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
2842  tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
2843 
2844  tmp10 = tmp0 + tmp2;
2845  tmp12 = tmp0 - tmp2;
2846 
2847  tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
2848  tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
2849  tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
2850 
2851  dataptr[DCTSIZE*0] = (DCTELEM)
2852  DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
2853  CONST_BITS+PASS1_BITS+1);
2854  dataptr[DCTSIZE*2] = (DCTELEM)
2855  DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
2856  CONST_BITS+PASS1_BITS+1);
2857  dataptr[DCTSIZE*4] = (DCTELEM)
2858  DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
2859  CONST_BITS+PASS1_BITS+1);
2860 
2861  /* Odd part */
2862 
2863  tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
2864 
2865  dataptr[DCTSIZE*1] = (DCTELEM)
2866  DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
2867  CONST_BITS+PASS1_BITS+1);
2868  dataptr[DCTSIZE*3] = (DCTELEM)
2869  DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
2870  CONST_BITS+PASS1_BITS+1);
2871  dataptr[DCTSIZE*5] = (DCTELEM)
2872  DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
2873  CONST_BITS+PASS1_BITS+1);
2874 
2875  dataptr++; /* advance pointer to next column */
2876  }
2877 }
2878 
2879 
2880 /*
2881  * Perform the forward DCT on a 10x5 sample block.
2882  *
2883  * 10-point FDCT in pass 1 (rows), 5-point in pass 2 (columns).
2884  */
2885 
2886 GLOBAL(void)
2888 {
2889  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
2890  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
2891  DCTELEM *dataptr;
2892  JSAMPROW elemptr;
2893  int ctr;
2894  SHIFT_TEMPS
2895 
2896  /* Zero 3 bottom rows of output coefficient block. */
2897  MEMZERO(&data[DCTSIZE*5], SIZEOF(DCTELEM) * DCTSIZE * 3);
2898 
2899  /* Pass 1: process rows.
2900  * Note results are scaled up by sqrt(8) compared to a true DCT;
2901  * furthermore, we scale the results by 2**PASS1_BITS.
2902  * 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
2903  */
2904 
2905  dataptr = data;
2906  for (ctr = 0; ctr < 5; ctr++) {
2907  elemptr = sample_data[ctr] + start_col;
2908 
2909  /* Even part */
2910 
2911  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
2912  tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
2913  tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
2914  tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
2915  tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
2916 
2917  tmp10 = tmp0 + tmp4;
2918  tmp13 = tmp0 - tmp4;
2919  tmp11 = tmp1 + tmp3;
2920  tmp14 = tmp1 - tmp3;
2921 
2922  tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
2923  tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
2924  tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
2925  tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
2926  tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
2927 
2928  /* Apply unsigned->signed conversion. */
2929  dataptr[0] = (DCTELEM)
2930  ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << PASS1_BITS);
2931  tmp12 += tmp12;
2932  dataptr[4] = (DCTELEM)
2933  DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
2934  MULTIPLY(tmp11 - tmp12, FIX(0.437016024)), /* c8 */
2935  CONST_BITS-PASS1_BITS);
2936  tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876)); /* c6 */
2937  dataptr[2] = (DCTELEM)
2938  DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)), /* c2-c6 */
2939  CONST_BITS-PASS1_BITS);
2940  dataptr[6] = (DCTELEM)
2941  DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)), /* c2+c6 */
2942  CONST_BITS-PASS1_BITS);
2943 
2944  /* Odd part */
2945 
2946  tmp10 = tmp0 + tmp4;
2947  tmp11 = tmp1 - tmp3;
2948  dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << PASS1_BITS);
2949  tmp2 <<= CONST_BITS;
2950  dataptr[1] = (DCTELEM)
2951  DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) + /* c1 */
2952  MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 + /* c3 */
2953  MULTIPLY(tmp3, FIX(0.642039522)) + /* c7 */
2954  MULTIPLY(tmp4, FIX(0.221231742)), /* c9 */
2955  CONST_BITS-PASS1_BITS);
2956  tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) - /* (c3+c7)/2 */
2957  MULTIPLY(tmp1 + tmp3, FIX(0.587785252)); /* (c1-c9)/2 */
2958  tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) + /* (c3-c7)/2 */
2959  (tmp11 << (CONST_BITS - 1)) - tmp2;
2960  dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-PASS1_BITS);
2961  dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-PASS1_BITS);
2962 
2963  dataptr += DCTSIZE; /* advance pointer to next row */
2964  }
2965 
2966  /* Pass 2: process columns.
2967  * We remove the PASS1_BITS scaling, but leave the results scaled up
2968  * by an overall factor of 8.
2969  * We must also scale the output by (8/10)*(8/5) = 32/25, which we
2970  * fold into the constant multipliers:
2971  * 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10) * 32/25.
2972  */
2973 
2974  dataptr = data;
2975  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2976  /* Even part */
2977 
2978  tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
2979  tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
2980  tmp2 = dataptr[DCTSIZE*2];
2981 
2982  tmp10 = tmp0 + tmp1;
2983  tmp11 = tmp0 - tmp1;
2984 
2985  tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
2986  tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];
2987 
2988  dataptr[DCTSIZE*0] = (DCTELEM)
2989  DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)), /* 32/25 */
2990  CONST_BITS+PASS1_BITS);
2991  tmp11 = MULTIPLY(tmp11, FIX(1.011928851)); /* (c2+c4)/2 */
2992  tmp10 -= tmp2 << 2;
2993  tmp10 = MULTIPLY(tmp10, FIX(0.452548340)); /* (c2-c4)/2 */
2994  dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
2995  dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
2996 
2997  /* Odd part */
2998 
2999  tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961)); /* c3 */
3000 
3001  dataptr[DCTSIZE*1] = (DCTELEM)
3002  DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
3003  CONST_BITS+PASS1_BITS);
3004  dataptr[DCTSIZE*3] = (DCTELEM)
3005  DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
3006  CONST_BITS+PASS1_BITS);
3007 
3008  dataptr++; /* advance pointer to next column */
3009  }
3010 }
3011 
3012 
3013 /*
3014  * Perform the forward DCT on an 8x4 sample block.
3015  *
3016  * 8-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
3017  */
3018 
3019 GLOBAL(void)
3021 {
3022  INT32 tmp0, tmp1, tmp2, tmp3;
3023  INT32 tmp10, tmp11, tmp12, tmp13;
3024  INT32 z1;
3025  DCTELEM *dataptr;
3026  JSAMPROW elemptr;
3027  int ctr;
3028  SHIFT_TEMPS
3029 
3030  /* Zero 4 bottom rows of output coefficient block. */
3031  MEMZERO(&data[DCTSIZE*4], SIZEOF(DCTELEM) * DCTSIZE * 4);
3032 
3033  /* Pass 1: process rows.
3034  * Note results are scaled up by sqrt(8) compared to a true DCT;
3035  * furthermore, we scale the results by 2**PASS1_BITS.
3036  * We must also scale the output by 8/4 = 2, which we add here.
3037  * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3038  */
3039 
3040  dataptr = data;
3041  for (ctr = 0; ctr < 4; ctr++) {
3042  elemptr = sample_data[ctr] + start_col;
3043 
3044  /* Even part per LL&M figure 1 --- note that published figure is faulty;
3045  * rotator "c1" should be "c6".
3046  */
3047 
3048  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
3049  tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
3050  tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
3051  tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
3052 
3053  tmp10 = tmp0 + tmp3;
3054  tmp12 = tmp0 - tmp3;
3055  tmp11 = tmp1 + tmp2;
3056  tmp13 = tmp1 - tmp2;
3057 
3058  tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
3059  tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
3060  tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
3061  tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
3062 
3063  /* Apply unsigned->signed conversion. */
3064  dataptr[0] = (DCTELEM)
3065  ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << (PASS1_BITS+1));
3066  dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << (PASS1_BITS+1));
3067 
3068  z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); /* c6 */
3069  /* Add fudge factor here for final descale. */
3070  z1 += ONE << (CONST_BITS-PASS1_BITS-2);
3071 
3072  dataptr[2] = (DCTELEM)
3073  RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
3074  CONST_BITS-PASS1_BITS-1);
3075  dataptr[6] = (DCTELEM)
3076  RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
3077  CONST_BITS-PASS1_BITS-1);
3078 
3079  /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
3080  * i0..i3 in the paper are tmp0..tmp3 here.
3081  */
3082 
3083  tmp12 = tmp0 + tmp2;
3084  tmp13 = tmp1 + tmp3;
3085 
3086  z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
3087  /* Add fudge factor here for final descale. */
3088  z1 += ONE << (CONST_BITS-PASS1_BITS-2);
3089 
3090  tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* -c3+c5 */
3091  tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
3092  tmp12 += z1;
3093  tmp13 += z1;
3094 
3095  z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
3096  tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
3097  tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
3098  tmp0 += z1 + tmp12;
3099  tmp3 += z1 + tmp13;
3100 
3101  z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
3102  tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
3103  tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
3104  tmp1 += z1 + tmp13;
3105  tmp2 += z1 + tmp12;
3106 
3107  dataptr[1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS-PASS1_BITS-1);
3108  dataptr[3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS-PASS1_BITS-1);
3109  dataptr[5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS-1);
3110  dataptr[7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS-PASS1_BITS-1);
3111 
3112  dataptr += DCTSIZE; /* advance pointer to next row */
3113  }
3114 
3115  /* Pass 2: process columns.
3116  * We remove the PASS1_BITS scaling, but leave the results scaled up
3117  * by an overall factor of 8.
3118  * 4-point FDCT kernel,
3119  * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
3120  */
3121 
3122  dataptr = data;
3123  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
3124  /* Even part */
3125 
3126  /* Add fudge factor here for final descale. */
3127  tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
3128  tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
3129 
3130  tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
3131  tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
3132 
3133  dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
3134  dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
3135 
3136  /* Odd part */
3137 
3138  tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
3139  /* Add fudge factor here for final descale. */
3140  tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
3141 
3142  dataptr[DCTSIZE*1] = (DCTELEM)
3143  RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
3144  CONST_BITS+PASS1_BITS);
3145  dataptr[DCTSIZE*3] = (DCTELEM)
3146  RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
3147  CONST_BITS+PASS1_BITS);
3148 
3149  dataptr++; /* advance pointer to next column */
3150  }
3151 }
3152 
3153 
3154 /*
3155  * Perform the forward DCT on a 6x3 sample block.
3156  *
3157  * 6-point FDCT in pass 1 (rows), 3-point in pass 2 (columns).
3158  */
3159 
3160 GLOBAL(void)
3162 {
3163  INT32 tmp0, tmp1, tmp2;
3164  INT32 tmp10, tmp11, tmp12;
3165  DCTELEM *dataptr;
3166  JSAMPROW elemptr;
3167  int ctr;
3168  SHIFT_TEMPS
3169 
3170  /* Pre-zero output coefficient block. */
3172 
3173  /* Pass 1: process rows.
3174  * Note results are scaled up by sqrt(8) compared to a true DCT;
3175  * furthermore, we scale the results by 2**PASS1_BITS.
3176  * We scale the results further by 2 as part of output adaption
3177  * scaling for different DCT size.
3178  * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
3179  */
3180 
3181  dataptr = data;
3182  for (ctr = 0; ctr < 3; ctr++) {
3183  elemptr = sample_data[ctr] + start_col;
3184 
3185  /* Even part */
3186 
3187  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
3188  tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
3189  tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
3190 
3191  tmp10 = tmp0 + tmp2;
3192  tmp12 = tmp0 - tmp2;
3193 
3194  tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
3195  tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
3196  tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
3197 
3198  /* Apply unsigned->signed conversion. */
3199  dataptr[0] = (DCTELEM)
3200  ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << (PASS1_BITS+1));
3201  dataptr[2] = (DCTELEM)
3202  DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
3203  CONST_BITS-PASS1_BITS-1);
3204  dataptr[4] = (DCTELEM)
3205  DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
3206  CONST_BITS-PASS1_BITS-1);
3207 
3208  /* Odd part */
3209 
3210  tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
3211  CONST_BITS-PASS1_BITS-1);
3212 
3213  dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << (PASS1_BITS+1)));
3214  dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << (PASS1_BITS+1));
3215  dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << (PASS1_BITS+1)));
3216 
3217  dataptr += DCTSIZE; /* advance pointer to next row */
3218  }
3219 
3220  /* Pass 2: process columns.
3221  * We remove the PASS1_BITS scaling, but leave the results scaled up
3222  * by an overall factor of 8.
3223  * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
3224  * fold into the constant multipliers (other part was done in pass 1):
3225  * 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6) * 16/9.
3226  */
3227 
3228  dataptr = data;
3229  for (ctr = 0; ctr < 6; ctr++) {
3230  /* Even part */
3231 
3232  tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2];
3233  tmp1 = dataptr[DCTSIZE*1];
3234 
3235  tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2];
3236 
3237  dataptr[DCTSIZE*0] = (DCTELEM)
3238  DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
3239  CONST_BITS+PASS1_BITS);
3240  dataptr[DCTSIZE*2] = (DCTELEM)
3241  DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
3242  CONST_BITS+PASS1_BITS);
3243 
3244  /* Odd part */
3245 
3246  dataptr[DCTSIZE*1] = (DCTELEM)
3247  DESCALE(MULTIPLY(tmp2, FIX(2.177324216)), /* c1 */
3248  CONST_BITS+PASS1_BITS);
3249 
3250  dataptr++; /* advance pointer to next column */
3251  }
3252 }
3253 
3254 
3255 /*
3256  * Perform the forward DCT on a 4x2 sample block.
3257  *
3258  * 4-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
3259  */
3260 
3261 GLOBAL(void)
3263 {
3264  INT32 tmp0, tmp1;
3265  INT32 tmp10, tmp11;
3266  DCTELEM *dataptr;
3267  JSAMPROW elemptr;
3268  int ctr;
3269  SHIFT_TEMPS
3270 
3271  /* Pre-zero output coefficient block. */
3273 
3274  /* Pass 1: process rows.
3275  * Note results are scaled up by sqrt(8) compared to a true DCT;
3276  * furthermore, we scale the results by 2**PASS1_BITS.
3277  * We must also scale the output by (8/4)*(8/2) = 2**3, which we add here.
3278  * 4-point FDCT kernel,
3279  * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
3280  */
3281 
3282  dataptr = data;
3283  for (ctr = 0; ctr < 2; ctr++) {
3284  elemptr = sample_data[ctr] + start_col;
3285 
3286  /* Even part */
3287 
3288  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
3289  tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
3290 
3291  tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
3292  tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
3293 
3294  /* Apply unsigned->signed conversion. */
3295  dataptr[0] = (DCTELEM)
3296  ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+3));
3297  dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+3));
3298 
3299  /* Odd part */
3300 
3301  tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
3302  /* Add fudge factor here for final descale. */
3303  tmp0 += ONE << (CONST_BITS-PASS1_BITS-4);
3304 
3305  dataptr[1] = (DCTELEM)
3306  RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
3307  CONST_BITS-PASS1_BITS-3);
3308  dataptr[3] = (DCTELEM)
3309  RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
3310  CONST_BITS-PASS1_BITS-3);
3311 
3312  dataptr += DCTSIZE; /* advance pointer to next row */
3313  }
3314 
3315  /* Pass 2: process columns.
3316  * We remove the PASS1_BITS scaling, but leave the results scaled up
3317  * by an overall factor of 8.
3318  */
3319 
3320  dataptr = data;
3321  for (ctr = 0; ctr < 4; ctr++) {
3322  /* Even part */
3323 
3324  /* Add fudge factor here for final descale. */
3325  tmp0 = dataptr[DCTSIZE*0] + (ONE << (PASS1_BITS-1));
3326  tmp1 = dataptr[DCTSIZE*1];
3327 
3328  dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
3329 
3330  /* Odd part */
3331 
3332  dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
3333 
3334  dataptr++; /* advance pointer to next column */
3335  }
3336 }
3337 
3338 
3339 /*
3340  * Perform the forward DCT on a 2x1 sample block.
3341  *
3342  * 2-point FDCT in pass 1 (rows), 1-point in pass 2 (columns).
3343  */
3344 
3345 GLOBAL(void)
3347 {
3348  DCTELEM tmp0, tmp1;
3349  JSAMPROW elemptr;
3350 
3351  /* Pre-zero output coefficient block. */
3353 
3354  elemptr = sample_data[0] + start_col;
3355 
3356  tmp0 = GETJSAMPLE(elemptr[0]);
3357  tmp1 = GETJSAMPLE(elemptr[1]);
3358 
3359  /* We leave the results scaled up by an overall factor of 8.
3360  * We must also scale the output by (8/2)*(8/1) = 2**5.
3361  */
3362 
3363  /* Even part */
3364 
3365  /* Apply unsigned->signed conversion. */
3366  data[0] = (tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5;
3367 
3368  /* Odd part */
3369 
3370  data[1] = (tmp0 - tmp1) << 5;
3371 }
3372 
3373 
3374 /*
3375  * Perform the forward DCT on an 8x16 sample block.
3376  *
3377  * 8-point FDCT in pass 1 (rows), 16-point in pass 2 (columns).
3378  */
3379 
3380 GLOBAL(void)
3382 {
3383  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3384  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
3385  INT32 z1;
3386  DCTELEM workspace[DCTSIZE2];
3387  DCTELEM *dataptr;
3388  DCTELEM *wsptr;
3389  JSAMPROW elemptr;
3390  int ctr;
3391  SHIFT_TEMPS
3392 
3393  /* Pass 1: process rows.
3394  * Note results are scaled up by sqrt(8) compared to a true DCT;
3395  * furthermore, we scale the results by 2**PASS1_BITS.
3396  * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3397  */
3398 
3399  dataptr = data;
3400  ctr = 0;
3401  for (;;) {
3402  elemptr = sample_data[ctr] + start_col;
3403 
3404  /* Even part per LL&M figure 1 --- note that published figure is faulty;
3405  * rotator "c1" should be "c6".
3406  */
3407 
3408  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
3409  tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
3410  tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
3411  tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
3412 
3413  tmp10 = tmp0 + tmp3;
3414  tmp12 = tmp0 - tmp3;
3415  tmp11 = tmp1 + tmp2;
3416  tmp13 = tmp1 - tmp2;
3417 
3418  tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
3419  tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
3420  tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
3421  tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
3422 
3423  /* Apply unsigned->signed conversion. */
3424  dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
3425  dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
3426 
3427  z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); /* c6 */
3428  dataptr[2] = (DCTELEM)
3429  DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
3430  CONST_BITS-PASS1_BITS);
3431  dataptr[6] = (DCTELEM)
3432  DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
3433  CONST_BITS-PASS1_BITS);
3434 
3435  /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
3436  * i0..i3 in the paper are tmp0..tmp3 here.
3437  */
3438 
3439  tmp12 = tmp0 + tmp2;
3440  tmp13 = tmp1 + tmp3;
3441 
3442  z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
3443  tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* -c3+c5 */
3444  tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
3445  tmp12 += z1;
3446  tmp13 += z1;
3447 
3448  z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
3449  tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
3450  tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
3451  tmp0 += z1 + tmp12;
3452  tmp3 += z1 + tmp13;
3453 
3454  z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
3455  tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
3456  tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
3457  tmp1 += z1 + tmp13;
3458  tmp2 += z1 + tmp12;
3459 
3460  dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
3461  dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
3462  dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
3463  dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS-PASS1_BITS);
3464 
3465  ctr++;
3466 
3467  if (ctr != DCTSIZE) {
3468  if (ctr == DCTSIZE * 2)
3469  break; /* Done. */
3470  dataptr += DCTSIZE; /* advance pointer to next row */
3471  } else
3472  dataptr = workspace; /* switch pointer to extended workspace */
3473  }
3474 
3475  /* Pass 2: process columns.
3476  * We remove the PASS1_BITS scaling, but leave the results scaled up
3477  * by an overall factor of 8.
3478  * We must also scale the output by 8/16 = 1/2.
3479  * 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
3480  */
3481 
3482  dataptr = data;
3483  wsptr = workspace;
3484  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
3485  /* Even part */
3486 
3487  tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
3488  tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
3489  tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
3490  tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
3491  tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
3492  tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
3493  tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
3494  tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
3495 
3496  tmp10 = tmp0 + tmp7;
3497  tmp14 = tmp0 - tmp7;
3498  tmp11 = tmp1 + tmp6;
3499  tmp15 = tmp1 - tmp6;
3500  tmp12 = tmp2 + tmp5;
3501  tmp16 = tmp2 - tmp5;
3502  tmp13 = tmp3 + tmp4;
3503  tmp17 = tmp3 - tmp4;
3504 
3505  tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
3506  tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
3507  tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
3508  tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
3509  tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
3510  tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
3511  tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
3512  tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];
3513 
3514  dataptr[DCTSIZE*0] = (DCTELEM)
3515  DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+1);
3516  dataptr[DCTSIZE*4] = (DCTELEM)
3517  DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
3518  MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
3519  CONST_BITS+PASS1_BITS+1);
3520 
3521  tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
3522  MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
3523 
3524  dataptr[DCTSIZE*2] = (DCTELEM)
3525  DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
3526  + MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
3527  CONST_BITS+PASS1_BITS+1);
3528  dataptr[DCTSIZE*6] = (DCTELEM)
3529  DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
3530  - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
3531  CONST_BITS+PASS1_BITS+1);
3532 
3533  /* Odd part */
3534 
3535  tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
3536  MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
3537  tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
3538  MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
3539  tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
3540  MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
3541  tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
3542  MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
3543  tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
3544  MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
3545  tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
3546  MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
3547  tmp10 = tmp11 + tmp12 + tmp13 -
3548  MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
3549  MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
3550  tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
3551  - MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
3552  tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
3553  + MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
3554  tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
3555  + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
3556 
3557  dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+1);
3558  dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+1);
3559  dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+1);
3560  dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+1);
3561 
3562  dataptr++; /* advance pointer to next column */
3563  wsptr++; /* advance pointer to next column */
3564  }
3565 }
3566 
3567 
3568 /*
3569  * Perform the forward DCT on a 7x14 sample block.
3570  *
3571  * 7-point FDCT in pass 1 (rows), 14-point in pass 2 (columns).
3572  */
3573 
3574 GLOBAL(void)
3576 {
3577  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
3578  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
3579  INT32 z1, z2, z3;
3580  DCTELEM workspace[8*6];
3581  DCTELEM *dataptr;
3582  DCTELEM *wsptr;
3583  JSAMPROW elemptr;
3584  int ctr;
3585  SHIFT_TEMPS
3586 
3587  /* Pre-zero output coefficient block. */
3589 
3590  /* Pass 1: process rows.
3591  * Note results are scaled up by sqrt(8) compared to a true DCT;
3592  * furthermore, we scale the results by 2**PASS1_BITS.
3593  * 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
3594  */
3595 
3596  dataptr = data;
3597  ctr = 0;
3598  for (;;) {
3599  elemptr = sample_data[ctr] + start_col;
3600 
3601  /* Even part */
3602 
3603  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
3604  tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
3605  tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
3606  tmp3 = GETJSAMPLE(elemptr[3]);
3607 
3608  tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
3609  tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
3610  tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
3611 
3612  z1 = tmp0 + tmp2;
3613  /* Apply unsigned->signed conversion. */
3614  dataptr[0] = (DCTELEM)
3615  ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
3616  tmp3 += tmp3;
3617  z1 -= tmp3;
3618  z1 -= tmp3;
3619  z1 = MULTIPLY(z1, FIX(0.353553391)); /* (c2+c6-c4)/2 */
3620  z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002)); /* (c2+c4-c6)/2 */
3621  z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123)); /* c6 */
3622  dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
3623  z1 -= z2;
3624  z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734)); /* c4 */
3625  dataptr[4] = (DCTELEM)
3626  DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
3627  CONST_BITS-PASS1_BITS);
3628  dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
3629 
3630  /* Odd part */
3631 
3632  tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347)); /* (c3+c1-c5)/2 */
3633  tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339)); /* (c3+c5-c1)/2 */
3634  tmp0 = tmp1 - tmp2;
3635  tmp1 += tmp2;
3636  tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
3637  tmp1 += tmp2;
3638  tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268)); /* c5 */
3639  tmp0 += tmp3;
3640  tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693)); /* c3+c1-c5 */
3641 
3642  dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
3643  dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
3644  dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
3645 
3646  ctr++;
3647 
3648  if (ctr != DCTSIZE) {
3649  if (ctr == 14)
3650  break; /* Done. */
3651  dataptr += DCTSIZE; /* advance pointer to next row */
3652  } else
3653  dataptr = workspace; /* switch pointer to extended workspace */
3654  }
3655 
3656  /* Pass 2: process columns.
3657  * We remove the PASS1_BITS scaling, but leave the results scaled up
3658  * by an overall factor of 8.
3659  * We must also scale the output by (8/7)*(8/14) = 32/49, which we
3660  * fold into the constant multipliers:
3661  * 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28) * 32/49.
3662  */
3663 
3664  dataptr = data;
3665  wsptr = workspace;
3666  for (ctr = 0; ctr < 7; ctr++) {
3667  /* Even part */
3668 
3669  tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
3670  tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
3671  tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
3672  tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
3673  tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
3674  tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
3675  tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
3676 
3677  tmp10 = tmp0 + tmp6;
3678  tmp14 = tmp0 - tmp6;
3679  tmp11 = tmp1 + tmp5;
3680  tmp15 = tmp1 - tmp5;
3681  tmp12 = tmp2 + tmp4;
3682  tmp16 = tmp2 - tmp4;
3683 
3684  tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
3685  tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
3686  tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
3687  tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
3688  tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
3689  tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
3690  tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];
3691 
3692  dataptr[DCTSIZE*0] = (DCTELEM)
3693  DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
3694  FIX(0.653061224)), /* 32/49 */
3695  CONST_BITS+PASS1_BITS);
3696  tmp13 += tmp13;
3697  dataptr[DCTSIZE*4] = (DCTELEM)
3698  DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
3699  MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
3700  MULTIPLY(tmp12 - tmp13, FIX(0.575835255)), /* c8 */
3701  CONST_BITS+PASS1_BITS);
3702 
3703  tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570)); /* c6 */
3704 
3705  dataptr[DCTSIZE*2] = (DCTELEM)
3706  DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691)) /* c2-c6 */
3707  + MULTIPLY(tmp16, FIX(0.400721155)), /* c10 */
3708  CONST_BITS+PASS1_BITS);
3709  dataptr[DCTSIZE*6] = (DCTELEM)
3710  DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725)) /* c6+c10 */
3711  - MULTIPLY(tmp16, FIX(0.900412262)), /* c2 */
3712  CONST_BITS+PASS1_BITS);
3713 
3714  /* Odd part */
3715 
3716  tmp10 = tmp1 + tmp2;
3717  tmp11 = tmp5 - tmp4;
3718  dataptr[DCTSIZE*7] = (DCTELEM)
3719  DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
3720  FIX(0.653061224)), /* 32/49 */
3721  CONST_BITS+PASS1_BITS);
3722  tmp3 = MULTIPLY(tmp3 , FIX(0.653061224)); /* 32/49 */
3723  tmp10 = MULTIPLY(tmp10, - FIX(0.103406812)); /* -c13 */
3724  tmp11 = MULTIPLY(tmp11, FIX(0.917760839)); /* c1 */
3725  tmp10 += tmp11 - tmp3;
3726  tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) + /* c5 */
3727  MULTIPLY(tmp4 + tmp6, FIX(0.491367823)); /* c9 */
3728  dataptr[DCTSIZE*5] = (DCTELEM)
3729  DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
3730  + MULTIPLY(tmp4, FIX(0.731428202)), /* c1+c11-c9 */
3731  CONST_BITS+PASS1_BITS);
3732  tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) + /* c3 */
3733  MULTIPLY(tmp5 - tmp6, FIX(0.305035186)); /* c11 */
3734  dataptr[DCTSIZE*3] = (DCTELEM)
3735  DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
3736  - MULTIPLY(tmp5, FIX(2.004803435)), /* c1+c5+c11 */
3737  CONST_BITS+PASS1_BITS);
3738  dataptr[DCTSIZE*1] = (DCTELEM)
3739  DESCALE(tmp11 + tmp12 + tmp3
3740  - MULTIPLY(tmp0, FIX(0.735987049)) /* c3+c5-c1 */
3741  - MULTIPLY(tmp6, FIX(0.082925825)), /* c9-c11-c13 */
3742  CONST_BITS+PASS1_BITS);
3743 
3744  dataptr++; /* advance pointer to next column */
3745  wsptr++; /* advance pointer to next column */
3746  }
3747 }
3748 
3749 
3750 /*
3751  * Perform the forward DCT on a 6x12 sample block.
3752  *
3753  * 6-point FDCT in pass 1 (rows), 12-point in pass 2 (columns).
3754  */
3755 
3756 GLOBAL(void)
3758 {
3759  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3760  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
3761  DCTELEM workspace[8*4];
3762  DCTELEM *dataptr;
3763  DCTELEM *wsptr;
3764  JSAMPROW elemptr;
3765  int ctr;
3766  SHIFT_TEMPS
3767 
3768  /* Pre-zero output coefficient block. */
3770 
3771  /* Pass 1: process rows.
3772  * Note results are scaled up by sqrt(8) compared to a true DCT;
3773  * furthermore, we scale the results by 2**PASS1_BITS.
3774  * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
3775  */
3776 
3777  dataptr = data;
3778  ctr = 0;
3779  for (;;) {
3780  elemptr = sample_data[ctr] + start_col;
3781 
3782  /* Even part */
3783 
3784  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
3785  tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
3786  tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
3787 
3788  tmp10 = tmp0 + tmp2;
3789  tmp12 = tmp0 - tmp2;
3790 
3791  tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
3792  tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
3793  tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
3794 
3795  /* Apply unsigned->signed conversion. */
3796  dataptr[0] = (DCTELEM)
3797  ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
3798  dataptr[2] = (DCTELEM)
3799  DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
3800  CONST_BITS-PASS1_BITS);
3801  dataptr[4] = (DCTELEM)
3802  DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
3803  CONST_BITS-PASS1_BITS);
3804 
3805  /* Odd part */
3806 
3807  tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
3808  CONST_BITS-PASS1_BITS);
3809 
3810  dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
3811  dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
3812  dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
3813 
3814  ctr++;
3815 
3816  if (ctr != DCTSIZE) {
3817  if (ctr == 12)
3818  break; /* Done. */
3819  dataptr += DCTSIZE; /* advance pointer to next row */
3820  } else
3821  dataptr = workspace; /* switch pointer to extended workspace */
3822  }
3823 
3824  /* Pass 2: process columns.
3825  * We remove the PASS1_BITS scaling, but leave the results scaled up
3826  * by an overall factor of 8.
3827  * We must also scale the output by (8/6)*(8/12) = 8/9, which we
3828  * fold into the constant multipliers:
3829  * 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24) * 8/9.
3830  */
3831 
3832  dataptr = data;
3833  wsptr = workspace;
3834  for (ctr = 0; ctr < 6; ctr++) {
3835  /* Even part */
3836 
3837  tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
3838  tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
3839  tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
3840  tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
3841  tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
3842  tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];
3843 
3844  tmp10 = tmp0 + tmp5;
3845  tmp13 = tmp0 - tmp5;
3846  tmp11 = tmp1 + tmp4;
3847  tmp14 = tmp1 - tmp4;
3848  tmp12 = tmp2 + tmp3;
3849  tmp15 = tmp2 - tmp3;
3850 
3851  tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
3852  tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
3853  tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
3854  tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
3855  tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
3856  tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];
3857 
3858  dataptr[DCTSIZE*0] = (DCTELEM)
3859  DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
3860  CONST_BITS+PASS1_BITS);
3861  dataptr[DCTSIZE*6] = (DCTELEM)
3862  DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
3863  CONST_BITS+PASS1_BITS);
3864  dataptr[DCTSIZE*4] = (DCTELEM)
3865  DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)), /* c4 */
3866  CONST_BITS+PASS1_BITS);
3867  dataptr[DCTSIZE*2] = (DCTELEM)
3868  DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) + /* 8/9 */
3869  MULTIPLY(tmp13 + tmp15, FIX(1.214244803)), /* c2 */
3870  CONST_BITS+PASS1_BITS);
3871 
3872  /* Odd part */
3873 
3874  tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200)); /* c9 */
3875  tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102)); /* c3-c9 */
3876  tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502)); /* c3+c9 */
3877  tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603)); /* c5 */
3878  tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039)); /* c7 */
3879  tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
3880  + MULTIPLY(tmp5, FIX(0.164081699)); /* c11 */
3881  tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
3882  tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
3883  + MULTIPLY(tmp5, FIX(0.765261039)); /* c7 */
3884  tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
3885  - MULTIPLY(tmp5, FIX(0.997307603)); /* c5 */
3886  tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
3887  - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
3888 
3889  dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS);
3890  dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS);
3891  dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS);
3892  dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS);
3893 
3894  dataptr++; /* advance pointer to next column */
3895  wsptr++; /* advance pointer to next column */
3896  }
3897 }
3898 
3899 
3900 /*
3901  * Perform the forward DCT on a 5x10 sample block.
3902  *
3903  * 5-point FDCT in pass 1 (rows), 10-point in pass 2 (columns).
3904  */
3905 
3906 GLOBAL(void)
3908 {
3909  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
3910  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
3911  DCTELEM workspace[8*2];
3912  DCTELEM *dataptr;
3913  DCTELEM *wsptr;
3914  JSAMPROW elemptr;
3915  int ctr;
3916  SHIFT_TEMPS
3917 
3918  /* Pre-zero output coefficient block. */
3920 
3921  /* Pass 1: process rows.
3922  * Note results are scaled up by sqrt(8) compared to a true DCT;
3923  * furthermore, we scale the results by 2**PASS1_BITS.
3924  * 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
3925  */
3926 
3927  dataptr = data;
3928  ctr = 0;
3929  for (;;) {
3930  elemptr = sample_data[ctr] + start_col;
3931 
3932  /* Even part */
3933 
3934  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
3935  tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
3936  tmp2 = GETJSAMPLE(elemptr[2]);
3937 
3938  tmp10 = tmp0 + tmp1;
3939  tmp11 = tmp0 - tmp1;
3940 
3941  tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
3942  tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
3943 
3944  /* Apply unsigned->signed conversion. */
3945  dataptr[0] = (DCTELEM)
3946  ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << PASS1_BITS);
3947  tmp11 = MULTIPLY(tmp11, FIX(0.790569415)); /* (c2+c4)/2 */
3948  tmp10 -= tmp2 << 2;
3949  tmp10 = MULTIPLY(tmp10, FIX(0.353553391)); /* (c2-c4)/2 */
3950  dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS);
3951  dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS);
3952 
3953  /* Odd part */
3954 
3955  tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876)); /* c3 */
3956 
3957  dataptr[1] = (DCTELEM)
3958  DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
3959  CONST_BITS-PASS1_BITS);
3960  dataptr[3] = (DCTELEM)
3961  DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
3962  CONST_BITS-PASS1_BITS);
3963 
3964  ctr++;
3965 
3966  if (ctr != DCTSIZE) {
3967  if (ctr == 10)
3968  break; /* Done. */
3969  dataptr += DCTSIZE; /* advance pointer to next row */
3970  } else
3971  dataptr = workspace; /* switch pointer to extended workspace */
3972  }
3973 
3974  /* Pass 2: process columns.
3975  * We remove the PASS1_BITS scaling, but leave the results scaled up
3976  * by an overall factor of 8.
3977  * We must also scale the output by (8/5)*(8/10) = 32/25, which we
3978  * fold into the constant multipliers:
3979  * 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20) * 32/25.
3980  */
3981 
3982  dataptr = data;
3983  wsptr = workspace;
3984  for (ctr = 0; ctr < 5; ctr++) {
3985  /* Even part */
3986 
3987  tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
3988  tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
3989  tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
3990  tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
3991  tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
3992 
3993  tmp10 = tmp0 + tmp4;
3994  tmp13 = tmp0 - tmp4;
3995  tmp11 = tmp1 + tmp3;
3996  tmp14 = tmp1 - tmp3;
3997 
3998  tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
3999  tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
4000  tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
4001  tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
4002  tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
4003 
4004  dataptr[DCTSIZE*0] = (DCTELEM)
4005  DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
4006  CONST_BITS+PASS1_BITS);
4007  tmp12 += tmp12;
4008  dataptr[DCTSIZE*4] = (DCTELEM)
4009  DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
4010  MULTIPLY(tmp11 - tmp12, FIX(0.559380511)), /* c8 */
4011  CONST_BITS+PASS1_BITS);
4012  tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961)); /* c6 */
4013  dataptr[DCTSIZE*2] = (DCTELEM)
4014  DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)), /* c2-c6 */
4015  CONST_BITS+PASS1_BITS);
4016  dataptr[DCTSIZE*6] = (DCTELEM)
4017  DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)), /* c2+c6 */
4018  CONST_BITS+PASS1_BITS);
4019 
4020  /* Odd part */
4021 
4022  tmp10 = tmp0 + tmp4;
4023  tmp11 = tmp1 - tmp3;
4024  dataptr[DCTSIZE*5] = (DCTELEM)
4025  DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)), /* 32/25 */
4026  CONST_BITS+PASS1_BITS);
4027  tmp2 = MULTIPLY(tmp2, FIX(1.28)); /* 32/25 */
4028  dataptr[DCTSIZE*1] = (DCTELEM)
4029  DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) + /* c1 */
4030  MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 + /* c3 */
4031  MULTIPLY(tmp3, FIX(0.821810588)) + /* c7 */
4032  MULTIPLY(tmp4, FIX(0.283176630)), /* c9 */
4033  CONST_BITS+PASS1_BITS);
4034  tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) - /* (c3+c7)/2 */
4035  MULTIPLY(tmp1 + tmp3, FIX(0.752365123)); /* (c1-c9)/2 */
4036  tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) + /* (c3-c7)/2 */
4037  MULTIPLY(tmp11, FIX(0.64)) - tmp2; /* 16/25 */
4038  dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+PASS1_BITS);
4039  dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+PASS1_BITS);
4040 
4041  dataptr++; /* advance pointer to next column */
4042  wsptr++; /* advance pointer to next column */
4043  }
4044 }
4045 
4046 
4047 /*
4048  * Perform the forward DCT on a 4x8 sample block.
4049  *
4050  * 4-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
4051  */
4052 
4053 GLOBAL(void)
4055 {
4056  INT32 tmp0, tmp1, tmp2, tmp3;
4057  INT32 tmp10, tmp11, tmp12, tmp13;
4058  INT32 z1;
4059  DCTELEM *dataptr;
4060  JSAMPROW elemptr;
4061  int ctr;
4062  SHIFT_TEMPS
4063 
4064  /* Pre-zero output coefficient block. */
4066 
4067  /* Pass 1: process rows.
4068  * Note results are scaled up by sqrt(8) compared to a true DCT;
4069  * furthermore, we scale the results by 2**PASS1_BITS.
4070  * We must also scale the output by 8/4 = 2, which we add here.
4071  * 4-point FDCT kernel,
4072  * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
4073  */
4074 
4075  dataptr = data;
4076  for (ctr = 0; ctr < DCTSIZE; ctr++) {
4077  elemptr = sample_data[ctr] + start_col;
4078 
4079  /* Even part */
4080 
4081  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
4082  tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
4083 
4084  tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
4085  tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
4086 
4087  /* Apply unsigned->signed conversion. */
4088  dataptr[0] = (DCTELEM)
4089  ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+1));
4090  dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+1));
4091 
4092  /* Odd part */
4093 
4094  tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
4095  /* Add fudge factor here for final descale. */
4096  tmp0 += ONE << (CONST_BITS-PASS1_BITS-2);
4097 
4098  dataptr[1] = (DCTELEM)
4099  RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
4100  CONST_BITS-PASS1_BITS-1);
4101  dataptr[3] = (DCTELEM)
4102  RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
4103  CONST_BITS-PASS1_BITS-1);
4104 
4105  dataptr += DCTSIZE; /* advance pointer to next row */
4106  }
4107 
4108  /* Pass 2: process columns.
4109  * We remove the PASS1_BITS scaling, but leave the results scaled up
4110  * by an overall factor of 8.
4111  * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
4112  */
4113 
4114  dataptr = data;
4115  for (ctr = 0; ctr < 4; ctr++) {
4116  /* Even part per LL&M figure 1 --- note that published figure is faulty;
4117  * rotator "c1" should be "c6".
4118  */
4119 
4120  tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
4121  tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
4122  tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
4123  tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
4124 
4125  /* Add fudge factor here for final descale. */
4126  tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
4127  tmp12 = tmp0 - tmp3;
4128  tmp11 = tmp1 + tmp2;
4129  tmp13 = tmp1 - tmp2;
4130 
4131  tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
4132  tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
4133  tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
4134  tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
4135 
4136  dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
4137  dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
4138 
4139  z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); /* c6 */
4140  /* Add fudge factor here for final descale. */
4141  z1 += ONE << (CONST_BITS+PASS1_BITS-1);
4142 
4143  dataptr[DCTSIZE*2] = (DCTELEM)
4144  RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
4145  CONST_BITS+PASS1_BITS);
4146  dataptr[DCTSIZE*6] = (DCTELEM)
4147  RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
4148  CONST_BITS+PASS1_BITS);
4149 
4150  /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
4151  * i0..i3 in the paper are tmp0..tmp3 here.
4152  */
4153 
4154  tmp12 = tmp0 + tmp2;
4155  tmp13 = tmp1 + tmp3;
4156 
4157  z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
4158  /* Add fudge factor here for final descale. */
4159  z1 += ONE << (CONST_BITS+PASS1_BITS-1);
4160 
4161  tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* -c3+c5 */
4162  tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
4163  tmp12 += z1;
4164  tmp13 += z1;
4165 
4166  z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
4167  tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
4168  tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
4169  tmp0 += z1 + tmp12;
4170  tmp3 += z1 + tmp13;
4171 
4172  z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
4173  tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
4174  tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
4175  tmp1 += z1 + tmp13;
4176  tmp2 += z1 + tmp12;
4177 
4178  dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS+PASS1_BITS);
4179  dataptr[DCTSIZE*3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS+PASS1_BITS);
4180  dataptr[DCTSIZE*5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS+PASS1_BITS);
4181  dataptr[DCTSIZE*7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS+PASS1_BITS);
4182 
4183  dataptr++; /* advance pointer to next column */
4184  }
4185 }
4186 
4187 
4188 /*
4189  * Perform the forward DCT on a 3x6 sample block.
4190  *
4191  * 3-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
4192  */
4193 
4194 GLOBAL(void)
4196 {
4197  INT32 tmp0, tmp1, tmp2;
4198  INT32 tmp10, tmp11, tmp12;
4199  DCTELEM *dataptr;
4200  JSAMPROW elemptr;
4201  int ctr;
4202  SHIFT_TEMPS
4203 
4204  /* Pre-zero output coefficient block. */
4206 
4207  /* Pass 1: process rows.
4208  * Note results are scaled up by sqrt(8) compared to a true DCT;
4209  * furthermore, we scale the results by 2**PASS1_BITS.
4210  * We scale the results further by 2 as part of output adaption
4211  * scaling for different DCT size.
4212  * 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
4213  */
4214 
4215  dataptr = data;
4216  for (ctr = 0; ctr < 6; ctr++) {
4217  elemptr = sample_data[ctr] + start_col;
4218 
4219  /* Even part */
4220 
4221  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
4222  tmp1 = GETJSAMPLE(elemptr[1]);
4223 
4224  tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
4225 
4226  /* Apply unsigned->signed conversion. */
4227  dataptr[0] = (DCTELEM)
4228  ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+1));
4229  dataptr[2] = (DCTELEM)
4230  DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
4231  CONST_BITS-PASS1_BITS-1);
4232 
4233  /* Odd part */
4234 
4235  dataptr[1] = (DCTELEM)
4236  DESCALE(MULTIPLY(tmp2, FIX(1.224744871)), /* c1 */
4237  CONST_BITS-PASS1_BITS-1);
4238 
4239  dataptr += DCTSIZE; /* advance pointer to next row */
4240  }
4241 
4242  /* Pass 2: process columns.
4243  * We remove the PASS1_BITS scaling, but leave the results scaled up
4244  * by an overall factor of 8.
4245  * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
4246  * fold into the constant multipliers (other part was done in pass 1):
4247  * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
4248  */
4249 
4250  dataptr = data;
4251  for (ctr = 0; ctr < 3; ctr++) {
4252  /* Even part */
4253 
4254  tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
4255  tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
4256  tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
4257 
4258  tmp10 = tmp0 + tmp2;
4259  tmp12 = tmp0 - tmp2;
4260 
4261  tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
4262  tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
4263  tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
4264 
4265  dataptr[DCTSIZE*0] = (DCTELEM)
4266  DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
4267  CONST_BITS+PASS1_BITS);
4268  dataptr[DCTSIZE*2] = (DCTELEM)
4269  DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
4270  CONST_BITS+PASS1_BITS);
4271  dataptr[DCTSIZE*4] = (DCTELEM)
4272  DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
4273  CONST_BITS+PASS1_BITS);
4274 
4275  /* Odd part */
4276 
4277  tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
4278 
4279  dataptr[DCTSIZE*1] = (DCTELEM)
4280  DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
4281  CONST_BITS+PASS1_BITS);
4282  dataptr[DCTSIZE*3] = (DCTELEM)
4283  DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
4284  CONST_BITS+PASS1_BITS);
4285  dataptr[DCTSIZE*5] = (DCTELEM)
4286  DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
4287  CONST_BITS+PASS1_BITS);
4288 
4289  dataptr++; /* advance pointer to next column */
4290  }
4291 }
4292 
4293 
4294 /*
4295  * Perform the forward DCT on a 2x4 sample block.
4296  *
4297  * 2-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
4298  */
4299 
4300 GLOBAL(void)
4302 {
4303  INT32 tmp0, tmp1;
4304  INT32 tmp10, tmp11;
4305  DCTELEM *dataptr;
4306  JSAMPROW elemptr;
4307  int ctr;
4308  SHIFT_TEMPS
4309 
4310  /* Pre-zero output coefficient block. */
4312 
4313  /* Pass 1: process rows.
4314  * Note results are scaled up by sqrt(8) compared to a true DCT.
4315  * We must also scale the output by (8/2)*(8/4) = 2**3, which we add here.
4316  */
4317 
4318  dataptr = data;
4319  for (ctr = 0; ctr < 4; ctr++) {
4320  elemptr = sample_data[ctr] + start_col;
4321 
4322  /* Even part */
4323 
4324  tmp0 = GETJSAMPLE(elemptr[0]);
4325  tmp1 = GETJSAMPLE(elemptr[1]);
4326 
4327  /* Apply unsigned->signed conversion. */
4328  dataptr[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 3);
4329 
4330  /* Odd part */
4331 
4332  dataptr[1] = (DCTELEM) ((tmp0 - tmp1) << 3);
4333 
4334  dataptr += DCTSIZE; /* advance pointer to next row */
4335  }
4336 
4337  /* Pass 2: process columns.
4338  * We leave the results scaled up by an overall factor of 8.
4339  * 4-point FDCT kernel,
4340  * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
4341  */
4342 
4343  dataptr = data;
4344  for (ctr = 0; ctr < 2; ctr++) {
4345  /* Even part */
4346 
4347  tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3];
4348  tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
4349 
4350  tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
4351  tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
4352 
4353  dataptr[DCTSIZE*0] = (DCTELEM) (tmp0 + tmp1);
4354  dataptr[DCTSIZE*2] = (DCTELEM) (tmp0 - tmp1);
4355 
4356  /* Odd part */
4357 
4358  tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
4359  /* Add fudge factor here for final descale. */
4360  tmp0 += ONE << (CONST_BITS-1);
4361 
4362  dataptr[DCTSIZE*1] = (DCTELEM)
4363  RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
4364  CONST_BITS);
4365  dataptr[DCTSIZE*3] = (DCTELEM)
4366  RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
4367  CONST_BITS);
4368 
4369  dataptr++; /* advance pointer to next column */
4370  }
4371 }
4372 
4373 
4374 /*
4375  * Perform the forward DCT on a 1x2 sample block.
4376  *
4377  * 1-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
4378  */
4379 
4380 GLOBAL(void)
4382 {
4383  DCTELEM tmp0, tmp1;
4384 
4385  /* Pre-zero output coefficient block. */
4387 
4388  /* Pass 1: empty. */
4389 
4390  /* Pass 2: process columns.
4391  * We leave the results scaled up by an overall factor of 8.
4392  * We must also scale the output by (8/1)*(8/2) = 2**5.
4393  */
4394 
4395  /* Even part */
4396 
4397  tmp0 = GETJSAMPLE(sample_data[0][start_col]);
4398  tmp1 = GETJSAMPLE(sample_data[1][start_col]);
4399 
4400  /* Apply unsigned->signed conversion. */
4401  data[DCTSIZE*0] = (tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5;
4402 
4403  /* Odd part */
4404 
4405  data[DCTSIZE*1] = (tmp0 - tmp1) << 5;
4406 }
4407 
4408 #endif /* DCT_SCALING_SUPPORTED */
4409 #endif /* DCT_ISLOW_SUPPORTED */
#define DESCALE(x, n)
Definition: jdct.h:366
#define CENTERJSAMPLE
Definition: jmorecfg.h:84
JSAMPLE FAR * JSAMPROW
Definition: jpeglib.h:75
JSAMPARRAY JDIMENSION start_col
Definition: jdct.h:169
INT32 DCTELEM
Definition: jdct.h:38
#define GETJSAMPLE(value)
Definition: jmorecfg.h:78
#define MEMZERO(addr, type, size)
Definition: svc_dg.c:324
#define SHIFT_TEMPS
Definition: jpegint.h:301
#define SIZEOF(_ar)
Definition: calc.h:97
Definition: texture.c:1663
static int blocks
Definition: mkdosfs.c:527
#define DCTSIZE2
Definition: jpeglib.h:51
GLint GLenum GLsizei GLsizei GLsizei GLint GLsizei const GLvoid * data
Definition: gl.h:1950
JSAMPROW * JSAMPARRAY
Definition: jpeglib.h:76
#define GLOBAL(type)
Definition: jmorecfg.h:291
#define FIX(x)
Definition: jccolor.c:74
#define RIGHT_SHIFT(x, shft)
Definition: jpegint.h:302
#define DCTSIZE
Definition: jpeglib.h:50
unsigned int JDIMENSION
Definition: jmorecfg.h:229
int const JOCTET * dataptr
Definition: jpeglib.h:1027
signed int INT32
JSAMPARRAY sample_data
Definition: jdct.h:169
Sorry
Definition: jdcolor.c:19