ReactOS 0.4.16-dev-2617-g01a0906
jfdctint.c
Go to the documentation of this file.
1/*
2 * jfdctint.c
3 *
4 * Copyright (C) 1991-1996, Thomas G. Lane.
5 * Modification developed 2003-2026 by Guido Vollbeding.
6 * This file is part of the Independent JPEG Group's software.
7 * For conditions of distribution and use, see the accompanying README file.
8 *
9 * This file contains a slow-but-accurate integer implementation of the
10 * forward DCT (Discrete Cosine Transform).
11 *
12 * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
13 * on each column. Direct algorithms are also available, but they are
14 * much more complex and seem not to be any faster when reduced to code.
15 *
16 * This implementation is based on an algorithm described in
17 * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
18 * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
19 * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
20 * The primary algorithm described there uses 11 multiplies and 29 adds.
21 * We use their alternate method with 12 multiplies and 32 adds.
22 * The advantage of this method is that no data path contains more than one
23 * multiplication; this allows a very simple and accurate implementation in
24 * scaled fixed-point arithmetic, with a minimal number of shifts.
25 *
26 * We also provide FDCT routines with various input sample block sizes for
27 * direct resolution reduction or enlargement and for directly resolving the
28 * common 2x1 and 1x2 subsampling cases without additional resampling: NxN
29 * (N=1...16), 2NxN, and Nx2N (N=1...8) samples for one 8x8 output DCT block.
30 *
31 * For N<8 we fill the remaining block coefficients with zero.
32 * For N>8 we apply a partial N-point FDCT on the input samples, computing
33 * just the lower 8 frequency coefficients and discarding the rest.
34 *
35 * We must scale the output coefficients of the N-point FDCT appropriately
36 * to the standard 8-point FDCT level by 8/N per 1-D pass. This scaling
37 * is folded into the constant multipliers (pass 2) and/or final/initial
38 * shifting.
39 *
40 * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
41 * since there would be too many additional constants to pre-calculate.
42 */
43
44#define JPEG_INTERNALS
45#include "jinclude.h"
46#include "jpeglib.h"
47#include "jdct.h" /* Private declarations for DCT subsystem */
48
49#ifdef DCT_ISLOW_SUPPORTED
50
51
52/*
53 * This module is specialized to the case DCTSIZE = 8.
54 */
55
56#if DCTSIZE != 8
57 Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
58#endif
59
60
61/*
62 * The poop on this scaling stuff is as follows:
63 *
64 * Each 1-D DCT step produces outputs which are a factor of sqrt(N)
65 * larger than the true DCT outputs. The final outputs are therefore
66 * a factor of N larger than desired; since N=8 this can be cured by
67 * a simple right shift at the end of the algorithm. The advantage of
68 * this arrangement is that we save two multiplications per 1-D DCT,
69 * because the y0 and y4 outputs need not be divided by sqrt(N).
70 * In the IJG code, this factor of 8 is removed by the quantization step
71 * (in jcdctmgr.c), NOT in this module.
72 *
73 * We have to do addition and subtraction of the integer inputs, which
74 * is no problem, and multiplication by fractional constants, which is
75 * a problem to do in integer arithmetic. We multiply all the constants
76 * by CONST_SCALE and convert them to integer constants (thus retaining
77 * CONST_BITS bits of precision in the constants). After doing a
78 * multiplication we have to divide the product by CONST_SCALE, with
79 * proper rounding, to produce the correct output. This division can
80 * be done cheaply as a right shift of CONST_BITS bits. We postpone
81 * shifting as long as possible so that partial sums can be added
82 * together with full fractional precision.
83 *
84 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
85 * they are represented to better-than-integral precision. These outputs
86 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit
87 * word with the recommended scaling. (For higher bit depths, the
88 * intermediate array is INT32 anyway.)
89 *
90 * To avoid overflow of the 32-bit intermediate results in pass 2, we
91 * must have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error
92 * analysis shows that the values given below are the most effective.
93 */
94
95#if BITS_IN_JSAMPLE <= 10 && JPEG_DATA_PRECISION <= 10
96#define CONST_BITS 13
97#define PASS1_BITS (10 - BITS_IN_JSAMPLE)
98#define PASS2_BITS (10 - JPEG_DATA_PRECISION)
99#else
100#if BITS_IN_JSAMPLE <= 13 && JPEG_DATA_PRECISION <= 13
101#define CONST_BITS 13
102#define PASS1_BITS (13 - BITS_IN_JSAMPLE)
103#define PASS2_BITS (13 - JPEG_DATA_PRECISION)
104#endif
105#endif
106
107/* Some C compilers fail to reduce "FIX(constant)" at compile time,
108 * thus causing a lot of useless floating-point operations at run time.
109 * To get around this we use the following pre-calculated constants.
110 * If you change CONST_BITS you may want to add appropriate values.
111 * (With a reasonable C compiler, you can just rely on the FIX() macro...)
112 */
113
114#if CONST_BITS == 13
115#define FIX_0_298631336 ((INT32) 2446) /* FIX(0.298631336) */
116#define FIX_0_390180644 ((INT32) 3196) /* FIX(0.390180644) */
117#define FIX_0_541196100 ((INT32) 4433) /* FIX(0.541196100) */
118#define FIX_0_765366865 ((INT32) 6270) /* FIX(0.765366865) */
119#define FIX_0_899976223 ((INT32) 7373) /* FIX(0.899976223) */
120#define FIX_1_175875602 ((INT32) 9633) /* FIX(1.175875602) */
121#define FIX_1_501321110 ((INT32) 12299) /* FIX(1.501321110) */
122#define FIX_1_847759065 ((INT32) 15137) /* FIX(1.847759065) */
123#define FIX_1_961570560 ((INT32) 16069) /* FIX(1.961570560) */
124#define FIX_2_053119869 ((INT32) 16819) /* FIX(2.053119869) */
125#define FIX_2_562915447 ((INT32) 20995) /* FIX(2.562915447) */
126#define FIX_3_072711026 ((INT32) 25172) /* FIX(3.072711026) */
127#else
128#define FIX_0_298631336 FIX(0.298631336)
129#define FIX_0_390180644 FIX(0.390180644)
130#define FIX_0_541196100 FIX(0.541196100)
131#define FIX_0_765366865 FIX(0.765366865)
132#define FIX_0_899976223 FIX(0.899976223)
133#define FIX_1_175875602 FIX(1.175875602)
134#define FIX_1_501321110 FIX(1.501321110)
135#define FIX_1_847759065 FIX(1.847759065)
136#define FIX_1_961570560 FIX(1.961570560)
137#define FIX_2_053119869 FIX(2.053119869)
138#define FIX_2_562915447 FIX(2.562915447)
139#define FIX_3_072711026 FIX(3.072711026)
140#endif
141
142
143/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
144 * For up to 10-bit data with the recommended scaling, all the variable
145 * and constant values involved are no more than 16 bits wide, so a
146 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
147 * For higher bit depths, a full 32-bit multiplication will be needed.
148 */
149
150#if BITS_IN_JSAMPLE <= 10 && JPEG_DATA_PRECISION <= 10
151#define MULTIPLY(var,const) MULTIPLY16C16(var,const)
152#else
153#define MULTIPLY(var,const) ((var) * (const))
154#endif
155
156
157/* Pass 1 output: smart scale up. */
158
159#if PASS1_BITS > 0
160#define PASS1_OUTPUT(x) (DCTELEM) ((x) << PASS1_BITS)
161#else
162#define PASS1_OUTPUT(x) (DCTELEM) (x)
163#endif
164
165
166/* Pass 2 output: smart scale down. */
167
168#if PASS2_BITS > 0
169#define PASS2_OUTPUT(x) (DCTELEM) RIGHT_SHIFT(x, PASS2_BITS)
170#else
171#define PASS2_OUTPUT(x) (DCTELEM) (x)
172#endif
173
174
175/*
176 * Perform the forward DCT on one block of samples.
177 */
178
/* 8x8 forward DCT: a row pass over the input samples followed by a column
 * pass performed in place on the coefficient block.
 *
 * As used in the body:
 *   data        - coefficient block; pass 1 writes it row by row (stride
 *                 DCTSIZE), pass 2 rewrites it column by column.
 *   sample_data - array of sample rows; rows 0..DCTSIZE-1 are read.
 *   start_col   - offset added to each row pointer to reach the first sample.
 *
 * NOTE(review): the line carrying the function name and parameter list, and
 * the declaration of dataptr, do not appear in this listing -- confirm
 * against the original source file.
 */
179GLOBAL(void)
181{
182 INT32 tmp0, tmp1, tmp2, tmp3;
183 INT32 tmp10, tmp11, tmp12, tmp13;
184 INT32 z1;
186 JSAMPROW elemptr;
187 int ctr;
189
190 /* Pass 1: process rows.
191 * Note results are scaled up by sqrt(8) compared to a true DCT;
192 * furthermore, we scale the results by 2**PASS1_BITS.
193 * cK represents sqrt(2) * cos(K*pi/16).
194 */
195
196 dataptr = data;
197 for (ctr = 0; ctr < DCTSIZE; ctr++) {
198 elemptr = sample_data[ctr] + start_col;
199
200 /* Even part per LL&M figure 1 --- note that published figure is faulty;
201 * rotator "c1" should be "c6".
202 */
203
/* Sums of mirrored sample pairs feed the even outputs. */
204 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
205 tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
206 tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
207 tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
208
209 tmp10 = tmp0 + tmp3;
210 tmp12 = tmp0 - tmp3;
211 tmp11 = tmp1 + tmp2;
212 tmp13 = tmp1 - tmp2;
213
/* Differences of the same pairs feed the odd outputs; tmp0..tmp3 are reused. */
214 tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
215 tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
216 tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
217 tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
218
/* tmp10 + tmp11 is the sum of all eight samples, so the level shift is
 * removed once as 8 * CENTERJSAMPLE; every other output is built from
 * differences in which the per-sample offset cancels.
 */
219 /* Apply unsigned->signed conversion. */
220 dataptr[0] = PASS1_OUTPUT(tmp10 + tmp11 - 8 * CENTERJSAMPLE);
221 dataptr[4] = PASS1_OUTPUT(tmp10 - tmp11);
222
223 z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); /* c6 */
224 /* Add fudge factor here for final descale. */
225 z1 += ONE << (CONST_BITS-PASS1_BITS-1);
226
227 dataptr[2] = (DCTELEM)
228 RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
229 CONST_BITS-PASS1_BITS);
230 dataptr[6] = (DCTELEM)
231 RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
232 CONST_BITS-PASS1_BITS);
233
234 /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
235 * i0..i3 in the paper are tmp0..tmp3 here.
236 */
237
238 tmp12 = tmp0 + tmp2;
239 tmp13 = tmp1 + tmp3;
240
241 z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
242 /* Add fudge factor here for final descale. */
243 z1 += ONE << (CONST_BITS-PASS1_BITS-1);
244
/* The rounding term in z1 enters tmp12 and tmp13 here; each odd output
 * below adds exactly one of them, so each is rounded exactly once.
 */
245 tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* -c3+c5 */
246 tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
247 tmp12 += z1;
248 tmp13 += z1;
249
250 z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
251 tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
252 tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
253 tmp0 += z1 + tmp12;
254 tmp3 += z1 + tmp13;
255
256 z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
257 tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
258 tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
259 tmp1 += z1 + tmp13;
260 tmp2 += z1 + tmp12;
261
262 dataptr[1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS-PASS1_BITS);
263 dataptr[3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS-PASS1_BITS);
264 dataptr[5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
265 dataptr[7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS-PASS1_BITS);
266
267 dataptr += DCTSIZE; /* advance pointer to next row */
268 }
269
270 /* Pass 2: process columns.
271 * We apply the PASS2_BITS scaling, but leave the
272 * results scaled up by an overall factor of 8.
273 * cK represents sqrt(2) * cos(K*pi/16).
274 */
275
276 dataptr = data;
/* ctr only counts the DCTSIZE iterations; the column visited is selected by
 * dataptr, which moves forward one element per iteration.
 */
277 for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
278 /* Even part per LL&M figure 1 --- note that published figure is faulty;
279 * rotator "c1" should be "c6".
280 */
281
282 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
283 tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
284 tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
285 tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
286
/* The rounding term is folded into tmp10 so that both PASS2_OUTPUT results
 * formed from tmp10 below are rounded; none is added when PASS2_BITS is 0.
 */
287 /* Add fudge factor here for final descale. */
288#if PASS2_BITS > 1
289 tmp10 = tmp0 + tmp3 + (ONE << (PASS2_BITS-1));
290#else
291#if PASS2_BITS > 0
292 tmp10 = tmp0 + tmp3 + ONE;
293#else
294 tmp10 = tmp0 + tmp3;
295#endif
296#endif
297 tmp12 = tmp0 - tmp3;
298 tmp11 = tmp1 + tmp2;
299 tmp13 = tmp1 - tmp2;
300
/* All eight column inputs are read into temporaries before the first store
 * back into the same column.
 */
301 tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
302 tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
303 tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
304 tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
305
306 dataptr[DCTSIZE*0] = PASS2_OUTPUT(tmp10 + tmp11);
307 dataptr[DCTSIZE*4] = PASS2_OUTPUT(tmp10 - tmp11);
308
309 z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); /* c6 */
310 /* Add fudge factor here for final descale. */
311 z1 += ONE << (CONST_BITS+PASS2_BITS-1);
312
313 dataptr[DCTSIZE*2] = (DCTELEM)
314 RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
315 CONST_BITS+PASS2_BITS);
316 dataptr[DCTSIZE*6] = (DCTELEM)
317 RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
318 CONST_BITS+PASS2_BITS);
319
320 /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
321 * i0..i3 in the paper are tmp0..tmp3 here.
322 */
323
324 tmp12 = tmp0 + tmp2;
325 tmp13 = tmp1 + tmp3;
326
327 z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
328 /* Add fudge factor here for final descale. */
329 z1 += ONE << (CONST_BITS+PASS2_BITS-1);
330
331 tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* -c3+c5 */
332 tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
333 tmp12 += z1;
334 tmp13 += z1;
335
336 z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
337 tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
338 tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
339 tmp0 += z1 + tmp12;
340 tmp3 += z1 + tmp13;
341
342 z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
343 tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
344 tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
345 tmp1 += z1 + tmp13;
346 tmp2 += z1 + tmp12;
347
348 dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS+PASS2_BITS);
349 dataptr[DCTSIZE*3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS+PASS2_BITS);
350 dataptr[DCTSIZE*5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS+PASS2_BITS);
351 dataptr[DCTSIZE*7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS+PASS2_BITS);
352
353 dataptr++; /* advance pointer to next column */
354 }
355}
356
357#ifdef DCT_SCALING_SUPPORTED
358
359
360/*
361 * Perform the forward DCT on a 7x7 sample block.
362 */
363
/* 7x7 forward DCT: reads 7 samples from each of 7 rows and writes the
 * top-left 7x7 region of the coefficient block (row stride DCTSIZE).
 *
 * As used in the body:
 *   data        - coefficient block, filled by pass 1 and rewritten in place
 *                 by pass 2.
 *   sample_data - array of sample rows; rows 0..6 are read.
 *   start_col   - offset added to each row pointer to reach the first sample.
 *
 * NOTE(review): the function name/parameter line, the declaration of dataptr
 * and the statement belonging to the "Pre-zero" comment do not appear in this
 * listing -- confirm against the original source file.
 */
364GLOBAL(void)
366{
367 INT32 tmp0, tmp1, tmp2, tmp3;
368 INT32 tmp10, tmp11, tmp12;
369 INT32 z1, z2, z3;
371 JSAMPROW elemptr;
372 int ctr;
374
375 /* Pre-zero output coefficient block. */
377
378 /* Pass 1: process rows.
379 * Note results are scaled up by sqrt(8) compared to a true DCT;
380 * furthermore, we scale the results by 2**PASS1_BITS.
381 * cK represents sqrt(2) * cos(K*pi/14).
382 */
383
384 dataptr = data;
385 for (ctr = 0; ctr < 7; ctr++) {
386 elemptr = sample_data[ctr] + start_col;
387
388 /* Even part */
389
/* Mirrored pairs around the unpaired center sample elemptr[3]. */
390 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
391 tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
392 tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
393 tmp3 = GETJSAMPLE(elemptr[3]);
394
395 tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
396 tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
397 tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
398
399 z1 = tmp0 + tmp2;
/* z1 + tmp1 + tmp3 sums all seven samples, hence 7 * CENTERJSAMPLE. */
400 /* Apply unsigned->signed conversion. */
401 dataptr[0] = PASS1_OUTPUT(z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE);
/* After the next three statements tmp3 holds twice the center sample and
 * z1 holds tmp0 + tmp2 - 4 * (center sample); the doubled tmp3 is reused in
 * the dataptr[4] expression below.
 */
402 tmp3 += tmp3;
403 z1 -= tmp3;
404 z1 -= tmp3;
405 z1 = MULTIPLY(z1, FIX(0.353553391)); /* (c2+c6-c4)/2 */
406 z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002)); /* (c2+c4-c6)/2 */
407 z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123)); /* c6 */
408 dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
409 z1 -= z2;
410 z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734)); /* c4 */
411 dataptr[4] = (DCTELEM)
412 DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
413 CONST_BITS-PASS1_BITS);
414 dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
415
416 /* Odd part */
417
/* tmp0..tmp3 are reused as accumulators for the three odd outputs. */
418 tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347)); /* (c3+c1-c5)/2 */
419 tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339)); /* (c3+c5-c1)/2 */
420 tmp0 = tmp1 - tmp2;
421 tmp1 += tmp2;
422 tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
423 tmp1 += tmp2;
424 tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268)); /* c5 */
425 tmp0 += tmp3;
426 tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693)); /* c3+c1-c5 */
427
428 dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
429 dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
430 dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
431
432 dataptr += DCTSIZE; /* advance pointer to next row */
433 }
434
435 /* Pass 2: process columns.
436 * We apply the PASS2_BITS scaling, but leave the
437 * results scaled up by an overall factor of 8.
438 * We must also scale the output by (8/7)**2 = 64/49,
439 * which we fold into the constant multipliers:
440 * cK now represents sqrt(2) * cos(K*pi/14) * 64/49.
441 */
442
443 dataptr = data;
/* Only the 7 columns written by pass 1 are transformed. */
444 for (ctr = 0; ctr < 7; ctr++) {
445 /* Even part */
446
447 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
448 tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
449 tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
450 tmp3 = dataptr[DCTSIZE*3];
451
452 tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
453 tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
454 tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];
455
456 z1 = tmp0 + tmp2;
457 dataptr[DCTSIZE*0] = (DCTELEM)
458 DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
459 CONST_BITS+PASS2_BITS);
/* Same doubling trick as in pass 1: z1 becomes tmp0 + tmp2 - 4 * center. */
460 tmp3 += tmp3;
461 z1 -= tmp3;
462 z1 -= tmp3;
463 z1 = MULTIPLY(z1, FIX(0.461784020)); /* (c2+c6-c4)/2 */
464 z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084)); /* (c2+c4-c6)/2 */
465 z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446)); /* c6 */
466 dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS2_BITS);
467 z1 -= z2;
468 z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509)); /* c4 */
469 dataptr[DCTSIZE*4] = (DCTELEM)
470 DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
471 CONST_BITS+PASS2_BITS);
472 dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS2_BITS);
473
474 /* Odd part */
475
476 tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677)); /* (c3+c1-c5)/2 */
477 tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464)); /* (c3+c5-c1)/2 */
478 tmp0 = tmp1 - tmp2;
479 tmp1 += tmp2;
480 tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
481 tmp1 += tmp2;
482 tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310)); /* c5 */
483 tmp0 += tmp3;
484 tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355)); /* c3+c1-c5 */
485
486 dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS2_BITS);
487 dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS2_BITS);
488 dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS2_BITS);
489
490 dataptr++; /* advance pointer to next column */
491 }
492}
493
494
495/*
496 * Perform the forward DCT on a 6x6 sample block.
497 */
498
/* 6x6 forward DCT: reads 6 samples from each of 6 rows and writes the
 * top-left 6x6 region of the coefficient block (row stride DCTSIZE).
 *
 * As used in the body:
 *   data        - coefficient block, filled by pass 1 and rewritten in place
 *                 by pass 2.
 *   sample_data - array of sample rows; rows 0..5 are read.
 *   start_col   - offset added to each row pointer to reach the first sample.
 *
 * NOTE(review): the function name/parameter line, the declaration of dataptr
 * and the statement belonging to the "Pre-zero" comment do not appear in this
 * listing -- confirm against the original source file.
 */
499GLOBAL(void)
501{
502 INT32 tmp0, tmp1, tmp2;
503 INT32 tmp10, tmp11, tmp12;
505 JSAMPROW elemptr;
506 int ctr;
508
509 /* Pre-zero output coefficient block. */
511
512 /* Pass 1: process rows.
513 * Note results are scaled up by sqrt(8) compared to a true DCT;
514 * furthermore, we scale the results by 2**PASS1_BITS.
515 * cK represents sqrt(2) * cos(K*pi/12).
516 */
517
518 dataptr = data;
519 for (ctr = 0; ctr < 6; ctr++) {
520 elemptr = sample_data[ctr] + start_col;
521
522 /* Even part */
523
/* tmp11 receives the middle pair sum directly; tmp0 and tmp2 are combined
 * into tmp10/tmp12 and then reused for the pair differences.
 */
524 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
525 tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
526 tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
527
528 tmp10 = tmp0 + tmp2;
529 tmp12 = tmp0 - tmp2;
530
531 tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
532 tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
533 tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
534
/* tmp10 + tmp11 sums all six samples, hence 6 * CENTERJSAMPLE. */
535 /* Apply unsigned->signed conversion. */
536 dataptr[0] = PASS1_OUTPUT(tmp10 + tmp11 - 6 * CENTERJSAMPLE);
537 dataptr[2] = (DCTELEM)
538 DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
539 CONST_BITS-PASS1_BITS);
540 dataptr[4] = (DCTELEM)
541 DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
542 CONST_BITS-PASS1_BITS);
543
544 /* Odd part */
545
/* The c5 product is descaled first, so the remaining terms can be added as
 * plain values shifted to the same PASS1_BITS scale, without a multiply.
 */
546 tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
547 CONST_BITS-PASS1_BITS);
548
549#if PASS1_BITS > 0
550 dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
551 dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
552 dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
553#else
554 dataptr[1] = (DCTELEM) (tmp10 + tmp0 + tmp1);
555 dataptr[3] = (DCTELEM) (tmp0 - tmp1 - tmp2);
556 dataptr[5] = (DCTELEM) (tmp10 + tmp2 - tmp1);
557#endif
558
559 dataptr += DCTSIZE; /* advance pointer to next row */
560 }
561
562 /* Pass 2: process columns.
563 * We apply the PASS2_BITS scaling, but leave the
564 * results scaled up by an overall factor of 8.
565 * We must also scale the output by (8/6)**2 = 16/9,
566 * which we fold into the constant multipliers:
567 * cK now represents sqrt(2) * cos(K*pi/12) * 16/9.
568 */
569
570 dataptr = data;
/* Only the 6 columns written by pass 1 are transformed. */
571 for (ctr = 0; ctr < 6; ctr++) {
572 /* Even part */
573
574 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
575 tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
576 tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
577
578 tmp10 = tmp0 + tmp2;
579 tmp12 = tmp0 - tmp2;
580
581 tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
582 tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
583 tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
584
585 dataptr[DCTSIZE*0] = (DCTELEM)
586 DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
587 CONST_BITS+PASS2_BITS);
588 dataptr[DCTSIZE*2] = (DCTELEM)
589 DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
590 CONST_BITS+PASS2_BITS);
591 dataptr[DCTSIZE*4] = (DCTELEM)
592 DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
593 CONST_BITS+PASS2_BITS);
594
595 /* Odd part */
596
/* Unlike pass 1, the c5 product stays undescaled here and is summed with
 * the 16/9 products before a single DESCALE per output.
 */
597 tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
598
599 dataptr[DCTSIZE*1] = (DCTELEM)
600 DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
601 CONST_BITS+PASS2_BITS);
602 dataptr[DCTSIZE*3] = (DCTELEM)
603 DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
604 CONST_BITS+PASS2_BITS);
605 dataptr[DCTSIZE*5] = (DCTELEM)
606 DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
607 CONST_BITS+PASS2_BITS);
608
609 dataptr++; /* advance pointer to next column */
610 }
611}
612
613
614/*
615 * Perform the forward DCT on a 5x5 sample block.
616 */
617
/* 5x5 forward DCT: reads 5 samples from each of 5 rows and writes the
 * top-left 5x5 region of the coefficient block (row stride DCTSIZE).
 *
 * As used in the body:
 *   data        - coefficient block, filled by pass 1 and rewritten in place
 *                 by pass 2.
 *   sample_data - array of sample rows; rows 0..4 are read.
 *   start_col   - offset added to each row pointer to reach the first sample.
 *
 * NOTE(review): the function name/parameter line, the declaration of dataptr
 * and the statement belonging to the "Pre-zero" comment do not appear in this
 * listing -- confirm against the original source file.
 */
618GLOBAL(void)
620{
621 INT32 tmp0, tmp1, tmp2;
622 INT32 tmp10, tmp11;
624 JSAMPROW elemptr;
625 int ctr;
627
628 /* Pre-zero output coefficient block. */
630
631 /* Pass 1: process rows.
632 * Note results are scaled up by sqrt(8) compared to a true DCT;
633 * furthermore, we scale the results by 2**PASS1_BITS.
634 * We scale the results further by 2 as part of output adaption
635 * scaling for different DCT size.
636 * cK represents sqrt(2) * cos(K*pi/10).
637 */
638
639 dataptr = data;
640 for (ctr = 0; ctr < 5; ctr++) {
641 elemptr = sample_data[ctr] + start_col;
642
643 /* Even part */
644
/* Mirrored pairs around the unpaired center sample elemptr[2]. */
645 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
646 tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
647 tmp2 = GETJSAMPLE(elemptr[2]);
648
649 tmp10 = tmp0 + tmp1;
650 tmp11 = tmp0 - tmp1;
651
652 tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
653 tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
654
/* tmp10 + tmp2 sums all five samples, hence 5 * CENTERJSAMPLE.  The extra
 * factor of 2 appears as PASS1_BITS+1 here and as the "-1" in the DESCALE
 * shift counts below.
 */
655 /* Apply unsigned->signed conversion. */
656 dataptr[0] = (DCTELEM)
657 ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << (PASS1_BITS+1));
658 tmp11 = MULTIPLY(tmp11, FIX(0.790569415)); /* (c2+c4)/2 */
/* tmp10 becomes tmp0 + tmp1 - 4 * (center sample). */
659 tmp10 -= tmp2 << 2;
660 tmp10 = MULTIPLY(tmp10, FIX(0.353553391)); /* (c2-c4)/2 */
661 dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS-1);
662 dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS-1);
663
664 /* Odd part */
665
666 tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876)); /* c3 */
667
668 dataptr[1] = (DCTELEM)
669 DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
670 CONST_BITS-PASS1_BITS-1);
671 dataptr[3] = (DCTELEM)
672 DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
673 CONST_BITS-PASS1_BITS-1);
674
675 dataptr += DCTSIZE; /* advance pointer to next row */
676 }
677
678 /* Pass 2: process columns.
679 * We apply the PASS2_BITS scaling, but leave the
680 * results scaled up by an overall factor of 8.
681 * We must also scale the output by (8/5)**2 = 64/25, which we partially
682 * fold into the constant multipliers (other part was done in pass 1):
683 * cK now represents sqrt(2) * cos(K*pi/10) * 32/25.
684 */
685
686 dataptr = data;
/* Only the 5 columns written by pass 1 are transformed. */
687 for (ctr = 0; ctr < 5; ctr++) {
688 /* Even part */
689
690 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
691 tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
692 tmp2 = dataptr[DCTSIZE*2];
693
694 tmp10 = tmp0 + tmp1;
695 tmp11 = tmp0 - tmp1;
696
697 tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
698 tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];
699
700 dataptr[DCTSIZE*0] = (DCTELEM)
701 DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)), /* 32/25 */
702 CONST_BITS+PASS2_BITS);
703 tmp11 = MULTIPLY(tmp11, FIX(1.011928851)); /* (c2+c4)/2 */
/* Same reduction as in pass 1: tmp10 becomes tmp0 + tmp1 - 4 * center. */
704 tmp10 -= tmp2 << 2;
705 tmp10 = MULTIPLY(tmp10, FIX(0.452548340)); /* (c2-c4)/2 */
706 dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS2_BITS);
707 dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS2_BITS);
708
709 /* Odd part */
710
711 tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961)); /* c3 */
712
713 dataptr[DCTSIZE*1] = (DCTELEM)
714 DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
715 CONST_BITS+PASS2_BITS);
716 dataptr[DCTSIZE*3] = (DCTELEM)
717 DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
718 CONST_BITS+PASS2_BITS);
719
720 dataptr++; /* advance pointer to next column */
721 }
722}
723
724
725/*
726 * Perform the forward DCT on a 4x4 sample block.
727 */
728
/* 4x4 forward DCT: reads 4 samples from each of 4 rows and writes the
 * top-left 4x4 region of the coefficient block (row stride DCTSIZE).
 * It reuses the 8-point FIX_* constants rather than FIX() expressions.
 *
 * As used in the body:
 *   data        - coefficient block, filled by pass 1 and rewritten in place
 *                 by pass 2.
 *   sample_data - array of sample rows; rows 0..3 are read.
 *   start_col   - offset added to each row pointer to reach the first sample.
 *
 * NOTE(review): the function name/parameter line, the declaration of dataptr
 * and the statement belonging to the "Pre-zero" comment do not appear in this
 * listing -- confirm against the original source file.
 */
729GLOBAL(void)
731{
732 INT32 tmp0, tmp1;
733 INT32 tmp10, tmp11;
735 JSAMPROW elemptr;
736 int ctr;
738
739 /* Pre-zero output coefficient block. */
741
742 /* Pass 1: process rows.
743 * Note results are scaled up by sqrt(8) compared to a true DCT;
744 * furthermore, we scale the results by 2**PASS1_BITS.
745 * We must also scale the output by (8/4)**2 = 2**2, which we add here.
746 * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
747 */
748
749 dataptr = data;
750 for (ctr = 0; ctr < 4; ctr++) {
751 elemptr = sample_data[ctr] + start_col;
752
753 /* Even part */
754
755 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
756 tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
757
758 tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
759 tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
760
/* tmp0 + tmp1 sums all four samples, hence 4 * CENTERJSAMPLE.  The extra
 * 2**2 factor appears as PASS1_BITS+2 here and as the "-2" in the odd-part
 * shift counts below.
 */
761 /* Apply unsigned->signed conversion. */
762 dataptr[0] = (DCTELEM)
763 ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+2));
764 dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+2));
765
766 /* Odd part */
767
/* tmp0 is reused as the shared c6 product for both odd outputs. */
768 tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
/* The rounding term is half of the divisor implied by the shift count
 * CONST_BITS-PASS1_BITS-2 used below.
 */
769 /* Add fudge factor here for final descale. */
770 tmp0 += ONE << (CONST_BITS-PASS1_BITS-3);
771
772 dataptr[1] = (DCTELEM)
773 RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
774 CONST_BITS-PASS1_BITS-2);
775 dataptr[3] = (DCTELEM)
776 RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
777 CONST_BITS-PASS1_BITS-2);
778
779 dataptr += DCTSIZE; /* advance pointer to next row */
780 }
781
782 /* Pass 2: process columns.
783 * We apply the PASS2_BITS scaling, but leave the
784 * results scaled up by an overall factor of 8.
785 * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
786 */
787
788 dataptr = data;
/* Only the 4 columns written by pass 1 are transformed. */
789 for (ctr = 0; ctr < 4; ctr++) {
790 /* Even part */
791
/* The rounding term is folded into tmp0 so both PASS2_OUTPUT results formed
 * from it are rounded; none is added when PASS2_BITS is 0.
 */
792 /* Add fudge factor here for final descale. */
793#if PASS2_BITS > 1
794 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS2_BITS-1));
795#else
796#if PASS2_BITS > 0
797 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + ONE;
798#else
799 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3];
800#endif
801#endif
802 tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
803
/* The differences are read before any element of this column is stored. */
804 tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
805 tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
806
807 dataptr[DCTSIZE*0] = PASS2_OUTPUT(tmp0 + tmp1);
808 dataptr[DCTSIZE*2] = PASS2_OUTPUT(tmp0 - tmp1);
809
810 /* Odd part */
811
812 tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
813 /* Add fudge factor here for final descale. */
814 tmp0 += ONE << (CONST_BITS+PASS2_BITS-1);
815
816 dataptr[DCTSIZE*1] = (DCTELEM)
817 RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
818 CONST_BITS+PASS2_BITS);
819 dataptr[DCTSIZE*3] = (DCTELEM)
820 RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
821 CONST_BITS+PASS2_BITS);
822
823 dataptr++; /* advance pointer to next column */
824 }
825}
826
827
828/*
829 * Perform the forward DCT on a 3x3 sample block.
830 */
831
/* 3x3 forward DCT: reads 3 samples from each of 3 rows and writes the
 * top-left 3x3 region of the coefficient block (row stride DCTSIZE).
 *
 * As used in the body:
 *   data        - coefficient block, filled by pass 1 and rewritten in place
 *                 by pass 2.
 *   sample_data - array of sample rows; rows 0..2 are read.
 *   start_col   - offset added to each row pointer to reach the first sample.
 *
 * NOTE(review): the function name/parameter line, the declaration of dataptr
 * and the statement belonging to the "Pre-zero" comment do not appear in this
 * listing -- confirm against the original source file.
 */
832GLOBAL(void)
834{
835 INT32 tmp0, tmp1, tmp2;
837 JSAMPROW elemptr;
838 int ctr;
840
841 /* Pre-zero output coefficient block. */
843
844 /* Pass 1: process rows.
845 * Note results are scaled up by sqrt(8) compared to a true DCT;
846 * furthermore, we scale the results by 2**PASS1_BITS.
847 * We scale the results further by 2**2 as part of output adaption
848 * scaling for different DCT size.
849 * cK represents sqrt(2) * cos(K*pi/6).
850 */
851
852 dataptr = data;
853 for (ctr = 0; ctr < 3; ctr++) {
854 elemptr = sample_data[ctr] + start_col;
855
856 /* Even part */
857
/* tmp0: sum of the outer samples; tmp1: center sample; tmp2: outer difference. */
858 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
859 tmp1 = GETJSAMPLE(elemptr[1]);
860
861 tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
862
/* tmp0 + tmp1 sums all three samples, hence 3 * CENTERJSAMPLE.  The extra
 * 2**2 factor appears as PASS1_BITS+2 here and as the "-2" in the DESCALE
 * shift counts below.
 */
863 /* Apply unsigned->signed conversion. */
864 dataptr[0] = (DCTELEM)
865 ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+2));
/* tmp0 - tmp1 - tmp1 is (outer sum) - 2 * (center sample). */
866 dataptr[2] = (DCTELEM)
867 DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
868 CONST_BITS-PASS1_BITS-2);
869
870 /* Odd part */
871
872 dataptr[1] = (DCTELEM)
873 DESCALE(MULTIPLY(tmp2, FIX(1.224744871)), /* c1 */
874 CONST_BITS-PASS1_BITS-2);
875
876 dataptr += DCTSIZE; /* advance pointer to next row */
877 }
878
879 /* Pass 2: process columns.
880 * We apply the PASS2_BITS scaling, but leave the
881 * results scaled up by an overall factor of 8.
882 * We must also scale the output by (8/3)**2 = 64/9, which we partially
883 * fold into the constant multipliers (other part was done in pass 1):
884 * cK now represents sqrt(2) * cos(K*pi/6) * 16/9.
885 */
886
887 dataptr = data;
/* Only the 3 columns written by pass 1 are transformed. */
888 for (ctr = 0; ctr < 3; ctr++) {
889 /* Even part */
890
891 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2];
892 tmp1 = dataptr[DCTSIZE*1];
893
894 tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2];
895
896 dataptr[DCTSIZE*0] = (DCTELEM)
897 DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
898 CONST_BITS+PASS2_BITS);
899 dataptr[DCTSIZE*2] = (DCTELEM)
900 DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
901 CONST_BITS+PASS2_BITS);
902
903 /* Odd part */
904
905 dataptr[DCTSIZE*1] = (DCTELEM)
906 DESCALE(MULTIPLY(tmp2, FIX(2.177324216)), /* c1 */
907 CONST_BITS+PASS2_BITS);
908
909 dataptr++; /* advance pointer to next column */
910 }
911}
912
913
914/*
915 * Perform the forward DCT on a 2x2 sample block.
916 */
917
/* 2x2 forward DCT: reads 2 samples from each of 2 rows and writes the four
 * coefficients data[0], data[1], data[DCTSIZE] and data[DCTSIZE+1].
 * No multiplications are used; both passes are sums and differences.
 *
 * As used in the body:
 *   data        - coefficient block receiving the four outputs.
 *   sample_data - array of sample rows; rows 0 and 1 are read.
 *   start_col   - offset added to each row pointer to reach the first sample.
 *
 * NOTE(review): the function name/parameter line and the statement belonging
 * to the "Pre-zero" comment do not appear in this listing -- confirm against
 * the original source file.
 */
918GLOBAL(void)
920{
921 DCTELEM tmp0, tmp1, tmp2, tmp3;
922 JSAMPROW elemptr;
923
924 /* Pre-zero output coefficient block. */
926
927 /* Pass 1: process rows.
928 * Note results are scaled up by sqrt(8) compared to a true DCT.
929 */
930
931 /* Row 0 */
932 elemptr = sample_data[0] + start_col;
933
/* tmp0/tmp1: sum and difference of row 0; tmp2/tmp3: the same for row 1. */
934 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
935 tmp1 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
936
937 /* Row 1 */
938 elemptr = sample_data[1] + start_col;
939
940 tmp2 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
941 tmp3 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
942
943 /* Pass 2: process columns.
944 * We leave the results scaled up by an overall factor of 8.
945 * We must also scale the output by (8/2)**2 = 2**4.
946 */
947
948 /* Column 0 */
949 /* Apply unsigned->signed conversion. */
950
/* When the net shift 4+PASS1_BITS-PASS2_BITS is positive it is applied as a
 * single left shift; otherwise the values are stored without any shift.
 * tmp0 + tmp2 sums all four samples, hence 4 * CENTERJSAMPLE.
 */
951#if PASS2_BITS < PASS1_BITS + 4
952 data[DCTSIZE*0] =
953 (tmp0 + tmp2 - 4 * CENTERJSAMPLE) << (4+PASS1_BITS-PASS2_BITS);
954 data[DCTSIZE*1] = (tmp0 - tmp2) << (4+PASS1_BITS-PASS2_BITS);
955
956 /* Column 1 */
957 data[DCTSIZE*0+1] = (tmp1 + tmp3) << (4+PASS1_BITS-PASS2_BITS);
958 data[DCTSIZE*1+1] = (tmp1 - tmp3) << (4+PASS1_BITS-PASS2_BITS);
959#else
960 data[DCTSIZE*0] = tmp0 + tmp2 - 4 * CENTERJSAMPLE;
961 data[DCTSIZE*1] = tmp0 - tmp2;
962
963 /* Column 1 */
964 data[DCTSIZE*0+1] = tmp1 + tmp3;
965 data[DCTSIZE*1+1] = tmp1 - tmp3;
966#endif
967}
968
969
970/*
971 * Perform the forward DCT on a 1x1 sample block.
972 */
973
974GLOBAL(void)
976{
977 DCTELEM dcval;
978
979 /* Pre-zero output coefficient block. */
981
982 dcval = GETJSAMPLE(sample_data[0][start_col]);
983
984 /* We leave the result scaled up by an overall factor of 8. */
985 /* We must also scale the output by (8/1)**2 = 2**6. */
986 /* Apply unsigned->signed conversion. */
987 data[0] = (dcval - CENTERJSAMPLE) << (6+PASS1_BITS-PASS2_BITS);
988}
989
990
/* Pass 1 bits decrement scaling for block sizes 9, 10, 11.
 *
 * PASS1_DECR is PASS1_BITS - 1, clamped so that it never goes negative.
 * PASS1_OUTDEC(x) scales a pass 1 result up by 2**PASS1_DECR and converts
 * it to DCTELEM.
 */

#if PASS1_BITS > 0
#define PASS1_DECR  (PASS1_BITS - 1)
#else
#define PASS1_DECR  0
#endif

#if PASS1_DECR > 0
/* Scale by multiplication rather than by left shift: the argument can be
 * negative, and left-shifting a negative value is undefined behavior in C.
 */
#define PASS1_OUTDEC(x)  ((DCTELEM) ((x) * (1 << PASS1_DECR)))
#else
#define PASS1_OUTDEC(x)  ((DCTELEM) (x))
#endif
1004
1005
1006/*
1007 * Perform the forward DCT on a 9x9 sample block.
1008 */
1009
1010GLOBAL(void)
1012{
1013 INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
1014 INT32 tmp10, tmp11, tmp12, tmp13;
1015 INT32 z1, z2;
1016 DCTELEM workspace[8];
1018 DCTELEM *wsptr;
1019 JSAMPROW elemptr;
1020 int ctr;
1022
1023 /* Pass 1: process rows.
1024 * Note results are scaled up by sqrt(8) compared to a true DCT;
1025 * furthermore, we scale the results by 2**PASS1_DECR.
1026 * cK represents sqrt(2) * cos(K*pi/18).
1027 */
1028
1029 dataptr = data;
1030 ctr = 0;
1031 for (;;) {
1032 elemptr = sample_data[ctr] + start_col;
1033
1034 /* Even part */
1035
1036 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[8]);
1037 tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[7]);
1038 tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[6]);
1039 tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[5]);
1040 tmp4 = GETJSAMPLE(elemptr[4]);
1041
1042 tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[8]);
1043 tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[7]);
1044 tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[6]);
1045 tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[5]);
1046
1047 z1 = tmp0 + tmp2 + tmp3;
1048 z2 = tmp1 + tmp4;
1049 /* Apply unsigned->signed conversion. */
1050 dataptr[0] = PASS1_OUTDEC(z1 + z2 - 9 * CENTERJSAMPLE);
1051 dataptr[6] = (DCTELEM)
1052 DESCALE(MULTIPLY(z1 - z2 - z2, FIX(0.707106781)), /* c6 */
1053 CONST_BITS-PASS1_DECR);
1054 z1 = MULTIPLY(tmp0 - tmp2, FIX(1.328926049)); /* c2 */
1055 z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(0.707106781)); /* c6 */
1056 dataptr[2] = (DCTELEM)
1057 DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.083350441)) /* c4 */
1058 + z1 + z2, CONST_BITS-PASS1_DECR);
1059 dataptr[4] = (DCTELEM)
1060 DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.245575608)) /* c8 */
1061 + z1 - z2, CONST_BITS-PASS1_DECR);
1062
1063 /* Odd part */
1064
1065 dataptr[3] = (DCTELEM)
1066 DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.224744871)), /* c3 */
1067 CONST_BITS-PASS1_DECR);
1068
1069 tmp11 = MULTIPLY(tmp11, FIX(1.224744871)); /* c3 */
1070 tmp0 = MULTIPLY(tmp10 + tmp12, FIX(0.909038955)); /* c5 */
1071 tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.483689525)); /* c7 */
1072
1073 dataptr[1] = (DCTELEM)
1074 DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS-PASS1_DECR);
1075
1076 tmp2 = MULTIPLY(tmp12 - tmp13, FIX(1.392728481)); /* c1 */
1077
1078 dataptr[5] = (DCTELEM)
1079 DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS-PASS1_DECR);
1080 dataptr[7] = (DCTELEM)
1081 DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS-PASS1_DECR);
1082
1083 ctr++;
1084
1085 if (ctr != DCTSIZE) {
1086 if (ctr == 9)
1087 break; /* Done. */
1088 dataptr += DCTSIZE; /* advance pointer to next row */
1089 } else
1090 dataptr = workspace; /* switch pointer to extended workspace */
1091 }
1092
1093 /* Pass 2: process columns.
1094 * We remove the PASS1_DECR scaling, but leave the results scaled up
1095 * by an overall factor of 8.
1096 * We must also scale the output by (8/9)**2 = 64/81, which we partially
1097 * fold into the constant multipliers and final shifting:
1098 * cK now represents sqrt(2) * cos(K*pi/18) * 128/81.
1099 */
1100
1101 dataptr = data;
1102 wsptr = workspace;
1103 for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1104 /* Even part */
1105
1106 tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*0];
1107 tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*7];
1108 tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*6];
1109 tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*5];
1110 tmp4 = dataptr[DCTSIZE*4];
1111
1112 tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*0];
1113 tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*7];
1114 tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*6];
1115 tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*5];
1116
1117 z1 = tmp0 + tmp2 + tmp3;
1118 z2 = tmp1 + tmp4;
1119 dataptr[DCTSIZE*0] = (DCTELEM)
1120 DESCALE(MULTIPLY(z1 + z2, FIX(1.580246914)), /* 128/81 */
1121 CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1122 dataptr[DCTSIZE*6] = (DCTELEM)
1123 DESCALE(MULTIPLY(z1 - z2 - z2, FIX(1.117403309)), /* c6 */
1124 CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1125 z1 = MULTIPLY(tmp0 - tmp2, FIX(2.100031287)); /* c2 */
1126 z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(1.117403309)); /* c6 */
1127 dataptr[DCTSIZE*2] = (DCTELEM)
1128 DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.711961190)) /* c4 */
1129 + z1 + z2, CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1130 dataptr[DCTSIZE*4] = (DCTELEM)
1131 DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.388070096)) /* c8 */
1132 + z1 - z2, CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1133
1134 /* Odd part */
1135
1136 dataptr[DCTSIZE*3] = (DCTELEM)
1137 DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.935399303)), /* c3 */
1138 CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1139
1140 tmp11 = MULTIPLY(tmp11, FIX(1.935399303)); /* c3 */
1141 tmp0 = MULTIPLY(tmp10 + tmp12, FIX(1.436506004)); /* c5 */
1142 tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.764348879)); /* c7 */
1143
1144 dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp11 + tmp0 + tmp1,
1145 CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1146
1147 tmp2 = MULTIPLY(tmp12 - tmp13, FIX(2.200854883)); /* c1 */
1148
1149 dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp0 - tmp11 - tmp2,
1150 CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1151 dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp1 - tmp11 + tmp2,
1152 CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1153
1154 dataptr++; /* advance pointer to next column */
1155 wsptr++; /* advance pointer to next column */
1156 }
1157}
1158
1159
1160/*
1161 * Perform the forward DCT on a 10x10 sample block.
1162 */
1163
1164GLOBAL(void)
1166{
1167 INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
1168 INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1169 DCTELEM workspace[8*2];
1171 DCTELEM *wsptr;
1172 JSAMPROW elemptr;
1173 int ctr;
1175
1176 /* Pass 1: process rows.
1177 * Note results are scaled up by sqrt(8) compared to a true DCT;
1178 * furthermore, we scale the results by 2**PASS1_DECR.
1179 * cK represents sqrt(2) * cos(K*pi/20).
1180 */
1181
1182 dataptr = data;
1183 ctr = 0;
1184 for (;;) {
1185 elemptr = sample_data[ctr] + start_col;
1186
1187 /* Even part */
1188
1189 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
1190 tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
1191 tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
1192 tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
1193 tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
1194
1195 tmp10 = tmp0 + tmp4;
1196 tmp13 = tmp0 - tmp4;
1197 tmp11 = tmp1 + tmp3;
1198 tmp14 = tmp1 - tmp3;
1199
1200 tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
1201 tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
1202 tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
1203 tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
1204 tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
1205
1206 /* Apply unsigned->signed conversion. */
1207 dataptr[0] =
1208 PASS1_OUTDEC(tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE);
1209 tmp12 += tmp12;
1210 dataptr[4] = (DCTELEM)
1211 DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
1212 MULTIPLY(tmp11 - tmp12, FIX(0.437016024)), /* c8 */
1213 CONST_BITS-PASS1_DECR);
1214 tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876)); /* c6 */
1215 dataptr[2] = (DCTELEM)
1216 DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)), /* c2-c6 */
1217 CONST_BITS-PASS1_DECR);
1218 dataptr[6] = (DCTELEM)
1219 DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)), /* c2+c6 */
1220 CONST_BITS-PASS1_DECR);
1221
1222 /* Odd part */
1223
1224 tmp10 = tmp0 + tmp4;
1225 tmp11 = tmp1 - tmp3;
1226 dataptr[5] = PASS1_OUTDEC(tmp10 - tmp11 - tmp2);
1227 tmp2 <<= CONST_BITS;
1228 dataptr[1] = (DCTELEM)
1229 DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) + /* c1 */
1230 MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 + /* c3 */
1231 MULTIPLY(tmp3, FIX(0.642039522)) + /* c7 */
1232 MULTIPLY(tmp4, FIX(0.221231742)), /* c9 */
1233 CONST_BITS-PASS1_DECR);
1234 tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) - /* (c3+c7)/2 */
1235 MULTIPLY(tmp1 + tmp3, FIX(0.587785252)); /* (c1-c9)/2 */
1236 tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) + /* (c3-c7)/2 */
1237 (tmp11 << (CONST_BITS - 1)) - tmp2;
1238 dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-PASS1_DECR);
1239 dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-PASS1_DECR);
1240
1241 ctr++;
1242
1243 if (ctr != DCTSIZE) {
1244 if (ctr == 10)
1245 break; /* Done. */
1246 dataptr += DCTSIZE; /* advance pointer to next row */
1247 } else
1248 dataptr = workspace; /* switch pointer to extended workspace */
1249 }
1250
1251 /* Pass 2: process columns.
1252 * We remove the PASS1_DECR scaling, but leave the results scaled up
1253 * by an overall factor of 8.
1254 * We must also scale the output by (8/10)**2 = 16/25, which we partially
1255 * fold into the constant multipliers and final shifting:
1256 * cK now represents sqrt(2) * cos(K*pi/20) * 32/25.
1257 */
1258
1259 dataptr = data;
1260 wsptr = workspace;
1261 for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1262 /* Even part */
1263
1264 tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
1265 tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
1266 tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
1267 tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
1268 tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
1269
1270 tmp10 = tmp0 + tmp4;
1271 tmp13 = tmp0 - tmp4;
1272 tmp11 = tmp1 + tmp3;
1273 tmp14 = tmp1 - tmp3;
1274
1275 tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
1276 tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
1277 tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
1278 tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
1279 tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
1280
1281 dataptr[DCTSIZE*0] = (DCTELEM)
1282 DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
1283 CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1284 tmp12 += tmp12;
1285 dataptr[DCTSIZE*4] = (DCTELEM)
1286 DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
1287 MULTIPLY(tmp11 - tmp12, FIX(0.559380511)), /* c8 */
1288 CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1289 tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961)); /* c6 */
1290 dataptr[DCTSIZE*2] = (DCTELEM)
1291 DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)), /* c2-c6 */
1292 CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1293 dataptr[DCTSIZE*6] = (DCTELEM)
1294 DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)), /* c2+c6 */
1295 CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1296
1297 /* Odd part */
1298
1299 tmp10 = tmp0 + tmp4;
1300 tmp11 = tmp1 - tmp3;
1301 dataptr[DCTSIZE*5] = (DCTELEM)
1302 DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)), /* 32/25 */
1303 CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1304 tmp2 = MULTIPLY(tmp2, FIX(1.28)); /* 32/25 */
1305 dataptr[DCTSIZE*1] = (DCTELEM)
1306 DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) + /* c1 */
1307 MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 + /* c3 */
1308 MULTIPLY(tmp3, FIX(0.821810588)) + /* c7 */
1309 MULTIPLY(tmp4, FIX(0.283176630)), /* c9 */
1310 CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1311 tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) - /* (c3+c7)/2 */
1312 MULTIPLY(tmp1 + tmp3, FIX(0.752365123)); /* (c1-c9)/2 */
1313 tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) + /* (c3-c7)/2 */
1314 MULTIPLY(tmp11, FIX(0.64)) - tmp2; /* 16/25 */
1315 dataptr[DCTSIZE*3] = (DCTELEM)
1316 DESCALE(tmp12 + tmp13, CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1317 dataptr[DCTSIZE*7] = (DCTELEM)
1318 DESCALE(tmp12 - tmp13, CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1319
1320 dataptr++; /* advance pointer to next column */
1321 wsptr++; /* advance pointer to next column */
1322 }
1323}
1324
1325
1326/*
1327 * Perform the forward DCT on an 11x11 sample block.
1328 */
1329
1330GLOBAL(void)
1332{
1333 INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1334 INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1335 INT32 z1, z2, z3;
1336 DCTELEM workspace[8*3];
1338 DCTELEM *wsptr;
1339 JSAMPROW elemptr;
1340 int ctr;
1342
1343 /* Pass 1: process rows.
1344 * Note results are scaled up by sqrt(8) compared to a true DCT;
1345 * furthermore, we scale the results by 2**PASS1_DECR.
1346 * cK represents sqrt(2) * cos(K*pi/22).
1347 */
1348
1349 dataptr = data;
1350 ctr = 0;
1351 for (;;) {
1352 elemptr = sample_data[ctr] + start_col;
1353
1354 /* Even part */
1355
1356 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[10]);
1357 tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[9]);
1358 tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[8]);
1359 tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[7]);
1360 tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[6]);
1361 tmp5 = GETJSAMPLE(elemptr[5]);
1362
1363 tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[10]);
1364 tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[9]);
1365 tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[8]);
1366 tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[7]);
1367 tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[6]);
1368
1369 /* Apply unsigned->signed conversion. */
1370 dataptr[0] =
1371 PASS1_OUTDEC(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 - 11 * CENTERJSAMPLE);
1372 tmp5 += tmp5;
1373 tmp0 -= tmp5;
1374 tmp1 -= tmp5;
1375 tmp2 -= tmp5;
1376 tmp3 -= tmp5;
1377 tmp4 -= tmp5;
1378 z1 = MULTIPLY(tmp0 + tmp3, FIX(1.356927976)) + /* c2 */
1379 MULTIPLY(tmp2 + tmp4, FIX(0.201263574)); /* c10 */
1380 z2 = MULTIPLY(tmp1 - tmp3, FIX(0.926112931)); /* c6 */
1381 z3 = MULTIPLY(tmp0 - tmp1, FIX(1.189712156)); /* c4 */
1382 dataptr[2] = (DCTELEM)
1383 DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.018300590)) /* c2+c8-c6 */
1384 - MULTIPLY(tmp4, FIX(1.390975730)), /* c4+c10 */
1385 CONST_BITS-PASS1_DECR);
1386 dataptr[4] = (DCTELEM)
1387 DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.062335650)) /* c4-c6-c10 */
1388 - MULTIPLY(tmp2, FIX(1.356927976)) /* c2 */
1389 + MULTIPLY(tmp4, FIX(0.587485545)), /* c8 */
1390 CONST_BITS-PASS1_DECR);
1391 dataptr[6] = (DCTELEM)
1392 DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.620527200)) /* c2+c4-c6 */
1393 - MULTIPLY(tmp2, FIX(0.788749120)), /* c8+c10 */
1394 CONST_BITS-PASS1_DECR);
1395
1396 /* Odd part */
1397
1398 tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.286413905)); /* c3 */
1399 tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.068791298)); /* c5 */
1400 tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.764581576)); /* c7 */
1401 tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.719967871)) /* c7+c5+c3-c1 */
1402 + MULTIPLY(tmp14, FIX(0.398430003)); /* c9 */
1403 tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.764581576)); /* -c7 */
1404 tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.399818907)); /* -c1 */
1405 tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.276416582)) /* c9+c7+c1-c3 */
1406 - MULTIPLY(tmp14, FIX(1.068791298)); /* c5 */
1407 tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.398430003)); /* c9 */
1408 tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(1.989053629)) /* c9+c5+c3-c7 */
1409 + MULTIPLY(tmp14, FIX(1.399818907)); /* c1 */
1410 tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.305598626)) /* c1+c5-c9-c7 */
1411 - MULTIPLY(tmp14, FIX(1.286413905)); /* c3 */
1412
1413 dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_DECR);
1414 dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_DECR);
1415 dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_DECR);
1416 dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS-PASS1_DECR);
1417
1418 ctr++;
1419
1420 if (ctr != DCTSIZE) {
1421 if (ctr == 11)
1422 break; /* Done. */
1423 dataptr += DCTSIZE; /* advance pointer to next row */
1424 } else
1425 dataptr = workspace; /* switch pointer to extended workspace */
1426 }
1427
1428 /* Pass 2: process columns.
1429 * We remove the PASS1_DECR scaling, but leave the results scaled up
1430 * by an overall factor of 8.
1431 * We must also scale the output by (8/11)**2 = 64/121, which we partially
1432 * fold into the constant multipliers and final shifting:
1433 * cK now represents sqrt(2) * cos(K*pi/22) * 128/121.
1434 */
1435
1436 dataptr = data;
1437 wsptr = workspace;
1438 for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1439 /* Even part */
1440
1441 tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*2];
1442 tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*1];
1443 tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*0];
1444 tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*7];
1445 tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*6];
1446 tmp5 = dataptr[DCTSIZE*5];
1447
1448 tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*2];
1449 tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*1];
1450 tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*0];
1451 tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*7];
1452 tmp14 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*6];
1453
1454 dataptr[DCTSIZE*0] = (DCTELEM)
1455 DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5,
1456 FIX(1.057851240)), /* 128/121 */
1457 CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1458 tmp5 += tmp5;
1459 tmp0 -= tmp5;
1460 tmp1 -= tmp5;
1461 tmp2 -= tmp5;
1462 tmp3 -= tmp5;
1463 tmp4 -= tmp5;
1464 z1 = MULTIPLY(tmp0 + tmp3, FIX(1.435427942)) + /* c2 */
1465 MULTIPLY(tmp2 + tmp4, FIX(0.212906922)); /* c10 */
1466 z2 = MULTIPLY(tmp1 - tmp3, FIX(0.979689713)); /* c6 */
1467 z3 = MULTIPLY(tmp0 - tmp1, FIX(1.258538479)); /* c4 */
1468 dataptr[DCTSIZE*2] = (DCTELEM)
1469 DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.077210542)) /* c2+c8-c6 */
1470 - MULTIPLY(tmp4, FIX(1.471445400)), /* c4+c10 */
1471 CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1472 dataptr[DCTSIZE*4] = (DCTELEM)
1473 DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.065941844)) /* c4-c6-c10 */
1474 - MULTIPLY(tmp2, FIX(1.435427942)) /* c2 */
1475 + MULTIPLY(tmp4, FIX(0.621472312)), /* c8 */
1476 CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1477 dataptr[DCTSIZE*6] = (DCTELEM)
1478 DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.714276708)) /* c2+c4-c6 */
1479 - MULTIPLY(tmp2, FIX(0.834379234)), /* c8+c10 */
1480 CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1481
1482 /* Odd part */
1483
1484 tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.360834544)); /* c3 */
1485 tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.130622199)); /* c5 */
1486 tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.808813568)); /* c7 */
1487 tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.819470145)) /* c7+c5+c3-c1 */
1488 + MULTIPLY(tmp14, FIX(0.421479672)); /* c9 */
1489 tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.808813568)); /* -c7 */
1490 tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.480800167)); /* -c1 */
1491 tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.350258864)) /* c9+c7+c1-c3 */
1492 - MULTIPLY(tmp14, FIX(1.130622199)); /* c5 */
1493 tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.421479672)); /* c9 */
1494 tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(2.104122847)) /* c9+c5+c3-c7 */
1495 + MULTIPLY(tmp14, FIX(1.480800167)); /* c1 */
1496 tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.381129125)) /* c1+c5-c9-c7 */
1497 - MULTIPLY(tmp14, FIX(1.360834544)); /* c3 */
1498
1499 dataptr[DCTSIZE*1] = (DCTELEM)
1500 DESCALE(tmp0, CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1501 dataptr[DCTSIZE*3] = (DCTELEM)
1502 DESCALE(tmp1, CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1503 dataptr[DCTSIZE*5] = (DCTELEM)
1504 DESCALE(tmp2, CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1505 dataptr[DCTSIZE*7] = (DCTELEM)
1506 DESCALE(tmp3, CONST_BITS+PASS1_DECR+1+PASS2_BITS-PASS1_BITS);
1507
1508 dataptr++; /* advance pointer to next column */
1509 wsptr++; /* advance pointer to next column */
1510 }
1511}
1512
1513
1514/*
1515 * Perform the forward DCT on a 12x12 sample block.
1516 */
1517
1518GLOBAL(void)
1520{
1521 INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1522 INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1523 DCTELEM workspace[8*4];
1525 DCTELEM *wsptr;
1526 JSAMPROW elemptr;
1527 int ctr;
1529
1530 /* Pass 1: process rows.
1531 * Note results are scaled up by sqrt(8) compared to a true DCT.
1532 * cK represents sqrt(2) * cos(K*pi/24).
1533 */
1534
1535 dataptr = data;
1536 ctr = 0;
1537 for (;;) {
1538 elemptr = sample_data[ctr] + start_col;
1539
1540 /* Even part */
1541
1542 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
1543 tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
1544 tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
1545 tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
1546 tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
1547 tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
1548
1549 tmp10 = tmp0 + tmp5;
1550 tmp13 = tmp0 - tmp5;
1551 tmp11 = tmp1 + tmp4;
1552 tmp14 = tmp1 - tmp4;
1553 tmp12 = tmp2 + tmp3;
1554 tmp15 = tmp2 - tmp3;
1555
1556 tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
1557 tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
1558 tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
1559 tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
1560 tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
1561 tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
1562
1563 /* Apply unsigned->signed conversion. */
1564 dataptr[0] = (DCTELEM) (tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE);
1565 dataptr[6] = (DCTELEM) (tmp13 - tmp14 - tmp15);
1566 dataptr[4] = (DCTELEM)
1567 DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
1568 CONST_BITS);
1569 dataptr[2] = (DCTELEM)
1570 DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
1571 CONST_BITS);
1572
1573 /* Odd part */
1574
1575 tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100); /* c9 */
1576 tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865); /* c3-c9 */
1577 tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065); /* c3+c9 */
1578 tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054)); /* c5 */
1579 tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669)); /* c7 */
1580 tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
1581 + MULTIPLY(tmp5, FIX(0.184591911)); /* c11 */
1582 tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
1583 tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
1584 + MULTIPLY(tmp5, FIX(0.860918669)); /* c7 */
1585 tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
1586 - MULTIPLY(tmp5, FIX(1.121971054)); /* c5 */
1587 tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
1588 - MULTIPLY(tmp2 + tmp5, FIX_0_541196100); /* c9 */
1589
1590 dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS);
1591 dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS);
1592 dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS);
1593 dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS);
1594
1595 ctr++;
1596
1597 if (ctr != DCTSIZE) {
1598 if (ctr == 12)
1599 break; /* Done. */
1600 dataptr += DCTSIZE; /* advance pointer to next row */
1601 } else
1602 dataptr = workspace; /* switch pointer to extended workspace */
1603 }
1604
1605 /* Pass 2: process columns.
1606 * We leave the results scaled up by an overall factor of 8.
1607 * We must also scale the output by (8/12)**2 = 4/9, which we partially
1608 * fold into the constant multipliers and final shifting:
1609 * cK now represents sqrt(2) * cos(K*pi/24) * 8/9.
1610 */
1611
1612 dataptr = data;
1613 wsptr = workspace;
1614 for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1615 /* Even part */
1616
1617 tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
1618 tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
1619 tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
1620 tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
1621 tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
1622 tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];
1623
1624 tmp10 = tmp0 + tmp5;
1625 tmp13 = tmp0 - tmp5;
1626 tmp11 = tmp1 + tmp4;
1627 tmp14 = tmp1 - tmp4;
1628 tmp12 = tmp2 + tmp3;
1629 tmp15 = tmp2 - tmp3;
1630
1631 tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
1632 tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
1633 tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
1634 tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
1635 tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
1636 tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];
1637
1638 dataptr[DCTSIZE*0] = (DCTELEM)
1639 DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
1640 CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1641 dataptr[DCTSIZE*6] = (DCTELEM)
1642 DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
1643 CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1644 dataptr[DCTSIZE*4] = (DCTELEM)
1645 DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)), /* c4 */
1646 CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1647 dataptr[DCTSIZE*2] = (DCTELEM)
1648 DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) + /* 8/9 */
1649 MULTIPLY(tmp13 + tmp15, FIX(1.214244803)), /* c2 */
1650 CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1651
1652 /* Odd part */
1653
1654 tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200)); /* c9 */
1655 tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102)); /* c3-c9 */
1656 tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502)); /* c3+c9 */
1657 tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603)); /* c5 */
1658 tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039)); /* c7 */
1659 tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
1660 + MULTIPLY(tmp5, FIX(0.164081699)); /* c11 */
1661 tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
1662 tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
1663 + MULTIPLY(tmp5, FIX(0.765261039)); /* c7 */
1664 tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
1665 - MULTIPLY(tmp5, FIX(0.997307603)); /* c5 */
1666 tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
1667 - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
1668
1669 dataptr[DCTSIZE*1] = (DCTELEM)
1670 DESCALE(tmp10, CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1671 dataptr[DCTSIZE*3] = (DCTELEM)
1672 DESCALE(tmp11, CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1673 dataptr[DCTSIZE*5] = (DCTELEM)
1674 DESCALE(tmp12, CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1675 dataptr[DCTSIZE*7] = (DCTELEM)
1676 DESCALE(tmp13, CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1677
1678 dataptr++; /* advance pointer to next column */
1679 wsptr++; /* advance pointer to next column */
1680 }
1681}
1682
1683
1684/*
1685 * Perform the forward DCT on a 13x13 sample block.
1686 */
1687
1688GLOBAL(void)
1690{
1691 INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1692 INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1693 INT32 z1, z2;
1694 DCTELEM workspace[8*5];
1696 DCTELEM *wsptr;
1697 JSAMPROW elemptr;
1698 int ctr;
1700
1701 /* Pass 1: process rows.
1702 * Note results are scaled up by sqrt(8) compared to a true DCT.
1703 * cK represents sqrt(2) * cos(K*pi/26).
1704 */
1705
1706 dataptr = data;
1707 ctr = 0;
1708 for (;;) {
1709 elemptr = sample_data[ctr] + start_col;
1710
1711 /* Even part */
1712
1713 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[12]);
1714 tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[11]);
1715 tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[10]);
1716 tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[9]);
1717 tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[8]);
1718 tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[7]);
1719 tmp6 = GETJSAMPLE(elemptr[6]);
1720
1721 tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[12]);
1722 tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[11]);
1723 tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[10]);
1724 tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[9]);
1725 tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[8]);
1726 tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[7]);
1727
1728 /* Apply unsigned->signed conversion. */
1729 dataptr[0] = (DCTELEM)
1730 (tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6 - 13 * CENTERJSAMPLE);
1731 tmp6 += tmp6;
1732 tmp0 -= tmp6;
1733 tmp1 -= tmp6;
1734 tmp2 -= tmp6;
1735 tmp3 -= tmp6;
1736 tmp4 -= tmp6;
1737 tmp5 -= tmp6;
1738 dataptr[2] = (DCTELEM)
1739 DESCALE(MULTIPLY(tmp0, FIX(1.373119086)) + /* c2 */
1740 MULTIPLY(tmp1, FIX(1.058554052)) + /* c6 */
1741 MULTIPLY(tmp2, FIX(0.501487041)) - /* c10 */
1742 MULTIPLY(tmp3, FIX(0.170464608)) - /* c12 */
1743 MULTIPLY(tmp4, FIX(0.803364869)) - /* c8 */
1744 MULTIPLY(tmp5, FIX(1.252223920)), /* c4 */
1745 CONST_BITS);
1746 z1 = MULTIPLY(tmp0 - tmp2, FIX(1.155388986)) - /* (c4+c6)/2 */
1747 MULTIPLY(tmp3 - tmp4, FIX(0.435816023)) - /* (c2-c10)/2 */
1748 MULTIPLY(tmp1 - tmp5, FIX(0.316450131)); /* (c8-c12)/2 */
1749 z2 = MULTIPLY(tmp0 + tmp2, FIX(0.096834934)) - /* (c4-c6)/2 */
1750 MULTIPLY(tmp3 + tmp4, FIX(0.937303064)) + /* (c2+c10)/2 */
1751 MULTIPLY(tmp1 + tmp5, FIX(0.486914739)); /* (c8+c12)/2 */
1752
1753 dataptr[4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS);
1754 dataptr[6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS);
1755
1756 /* Odd part */
1757
1758 tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.322312651)); /* c3 */
1759 tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.163874945)); /* c5 */
1760 tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.937797057)) + /* c7 */
1761 MULTIPLY(tmp14 + tmp15, FIX(0.338443458)); /* c11 */
1762 tmp0 = tmp1 + tmp2 + tmp3 -
1763 MULTIPLY(tmp10, FIX(2.020082300)) + /* c3+c5+c7-c1 */
1764 MULTIPLY(tmp14, FIX(0.318774355)); /* c9-c11 */
1765 tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.937797057)) - /* c7 */
1766 MULTIPLY(tmp11 + tmp12, FIX(0.338443458)); /* c11 */
1767 tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.163874945)); /* -c5 */
1768 tmp1 += tmp4 + tmp5 +
1769 MULTIPLY(tmp11, FIX(0.837223564)) - /* c5+c9+c11-c3 */
1770 MULTIPLY(tmp14, FIX(2.341699410)); /* c1+c7 */
1771 tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.657217813)); /* -c9 */
1772 tmp2 += tmp4 + tmp6 -
1773 MULTIPLY(tmp12, FIX(1.572116027)) + /* c1+c5-c9-c11 */
1774 MULTIPLY(tmp15, FIX(2.260109708)); /* c3+c7 */
1775 tmp3 += tmp5 + tmp6 +
1776 MULTIPLY(tmp13, FIX(2.205608352)) - /* c3+c5+c9-c7 */
1777 MULTIPLY(tmp15, FIX(1.742345811)); /* c1+c11 */
1778
1779 dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
1780 dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
1781 dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
1782 dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
1783
1784 ctr++;
1785
1786 if (ctr != DCTSIZE) {
1787 if (ctr == 13)
1788 break; /* Done. */
1789 dataptr += DCTSIZE; /* advance pointer to next row */
1790 } else
1791 dataptr = workspace; /* switch pointer to extended workspace */
1792 }
1793
1794 /* Pass 2: process columns.
1795 * We leave the results scaled up by an overall factor of 8.
1796 * We must also scale the output by (8/13)**2 = 64/169, which we partially
1797 * fold into the constant multipliers and final shifting:
1798 * cK now represents sqrt(2) * cos(K*pi/26) * 128/169.
1799 */
1800
1801 dataptr = data;
1802 wsptr = workspace;
1803 for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1804 /* Even part */
1805
1806 tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*4];
1807 tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*3];
1808 tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*2];
1809 tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*1];
1810 tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*0];
1811 tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*7];
1812 tmp6 = dataptr[DCTSIZE*6];
1813
1814 tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*4];
1815 tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*3];
1816 tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*2];
1817 tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*1];
1818 tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*0];
1819 tmp15 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*7];
1820
1821 dataptr[DCTSIZE*0] = (DCTELEM)
1822 DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6,
1823 FIX(0.757396450)), /* 128/169 */
1824 CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1825 tmp6 += tmp6;
1826 tmp0 -= tmp6;
1827 tmp1 -= tmp6;
1828 tmp2 -= tmp6;
1829 tmp3 -= tmp6;
1830 tmp4 -= tmp6;
1831 tmp5 -= tmp6;
1832 dataptr[DCTSIZE*2] = (DCTELEM)
1833 DESCALE(MULTIPLY(tmp0, FIX(1.039995521)) + /* c2 */
1834 MULTIPLY(tmp1, FIX(0.801745081)) + /* c6 */
1835 MULTIPLY(tmp2, FIX(0.379824504)) - /* c10 */
1836 MULTIPLY(tmp3, FIX(0.129109289)) - /* c12 */
1837 MULTIPLY(tmp4, FIX(0.608465700)) - /* c8 */
1838 MULTIPLY(tmp5, FIX(0.948429952)), /* c4 */
1839 CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1840 z1 = MULTIPLY(tmp0 - tmp2, FIX(0.875087516)) - /* (c4+c6)/2 */
1841 MULTIPLY(tmp3 - tmp4, FIX(0.330085509)) - /* (c2-c10)/2 */
1842 MULTIPLY(tmp1 - tmp5, FIX(0.239678205)); /* (c8-c12)/2 */
1843 z2 = MULTIPLY(tmp0 + tmp2, FIX(0.073342435)) - /* (c4-c6)/2 */
1844 MULTIPLY(tmp3 + tmp4, FIX(0.709910013)) + /* (c2+c10)/2 */
1845 MULTIPLY(tmp1 + tmp5, FIX(0.368787494)); /* (c8+c12)/2 */
1846
1847 dataptr[DCTSIZE*4] = (DCTELEM)
1848 DESCALE(z1 + z2, CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1849 dataptr[DCTSIZE*6] = (DCTELEM)
1850 DESCALE(z1 - z2, CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1851
1852 /* Odd part */
1853
1854 tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.001514908)); /* c3 */
1855 tmp2 = MULTIPLY(tmp10 + tmp12, FIX(0.881514751)); /* c5 */
1856 tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.710284161)) + /* c7 */
1857 MULTIPLY(tmp14 + tmp15, FIX(0.256335874)); /* c11 */
1858 tmp0 = tmp1 + tmp2 + tmp3 -
1859 MULTIPLY(tmp10, FIX(1.530003162)) + /* c3+c5+c7-c1 */
1860 MULTIPLY(tmp14, FIX(0.241438564)); /* c9-c11 */
1861 tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.710284161)) - /* c7 */
1862 MULTIPLY(tmp11 + tmp12, FIX(0.256335874)); /* c11 */
1863 tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(0.881514751)); /* -c5 */
1864 tmp1 += tmp4 + tmp5 +
1865 MULTIPLY(tmp11, FIX(0.634110155)) - /* c5+c9+c11-c3 */
1866 MULTIPLY(tmp14, FIX(1.773594819)); /* c1+c7 */
1867 tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.497774438)); /* -c9 */
1868 tmp2 += tmp4 + tmp6 -
1869 MULTIPLY(tmp12, FIX(1.190715098)) + /* c1+c5-c9-c11 */
1870 MULTIPLY(tmp15, FIX(1.711799069)); /* c3+c7 */
1871 tmp3 += tmp5 + tmp6 +
1872 MULTIPLY(tmp13, FIX(1.670519935)) - /* c3+c5+c9-c7 */
1873 MULTIPLY(tmp15, FIX(1.319646532)); /* c1+c11 */
1874
1875 dataptr[DCTSIZE*1] = (DCTELEM)
1876 DESCALE(tmp0, CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1877 dataptr[DCTSIZE*3] = (DCTELEM)
1878 DESCALE(tmp1, CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1879 dataptr[DCTSIZE*5] = (DCTELEM)
1880 DESCALE(tmp2, CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1881 dataptr[DCTSIZE*7] = (DCTELEM)
1882 DESCALE(tmp3, CONST_BITS+1+PASS2_BITS-PASS1_BITS);
1883
1884 dataptr++; /* advance pointer to next column */
1885 wsptr++; /* advance pointer to next column */
1886 }
1887}
1888
1889
1890/*
1891 * Perform the forward DCT on a 14x14 sample block.
1892 */
1893
1894GLOBAL(void)
1896{
1897 INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1898 INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
1899 DCTELEM workspace[8*6];
1901 DCTELEM *wsptr;
1902 JSAMPROW elemptr;
1903 int ctr;
1905
1906 /* Pass 1: process rows.
1907 * Note results are scaled up by sqrt(8) compared to a true DCT.
1908 * cK represents sqrt(2) * cos(K*pi/28).
1909 */
1910
1911 dataptr = data;
1912 ctr = 0;
1913 for (;;) {
1914 elemptr = sample_data[ctr] + start_col;
1915
1916 /* Even part */
1917
1918 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
1919 tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
1920 tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
1921 tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
1922 tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
1923 tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
1924 tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
1925
1926 tmp10 = tmp0 + tmp6;
1927 tmp14 = tmp0 - tmp6;
1928 tmp11 = tmp1 + tmp5;
1929 tmp15 = tmp1 - tmp5;
1930 tmp12 = tmp2 + tmp4;
1931 tmp16 = tmp2 - tmp4;
1932
1933 tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
1934 tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
1935 tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
1936 tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
1937 tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
1938 tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
1939 tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
1940
1941 /* Apply unsigned->signed conversion. */
1942 dataptr[0] = (DCTELEM)
1943 (tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE);
1944 tmp13 += tmp13;
1945 dataptr[4] = (DCTELEM)
1946 DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
1947 MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
1948 MULTIPLY(tmp12 - tmp13, FIX(0.881747734)), /* c8 */
1949 CONST_BITS);
1950
1951 tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686)); /* c6 */
1952
1953 dataptr[2] = (DCTELEM)
1954 DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590)) /* c2-c6 */
1955 + MULTIPLY(tmp16, FIX(0.613604268)), /* c10 */
1956 CONST_BITS);
1957 dataptr[6] = (DCTELEM)
1958 DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954)) /* c6+c10 */
1959 - MULTIPLY(tmp16, FIX(1.378756276)), /* c2 */
1960 CONST_BITS);
1961
1962 /* Odd part */
1963
1964 tmp10 = tmp1 + tmp2;
1965 tmp11 = tmp5 - tmp4;
1966 dataptr[7] = (DCTELEM) (tmp0 - tmp10 + tmp3 - tmp11 - tmp6);
1967 tmp3 <<= CONST_BITS;
1968 tmp10 = MULTIPLY(tmp10, - FIX(0.158341681)); /* -c13 */
1969 tmp11 = MULTIPLY(tmp11, FIX(1.405321284)); /* c1 */
1970 tmp10 += tmp11 - tmp3;
1971 tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) + /* c5 */
1972 MULTIPLY(tmp4 + tmp6, FIX(0.752406978)); /* c9 */
1973 dataptr[5] = (DCTELEM)
1974 DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
1975 + MULTIPLY(tmp4, FIX(1.119999435)), /* c1+c11-c9 */
1976 CONST_BITS);
1977 tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) + /* c3 */
1978 MULTIPLY(tmp5 - tmp6, FIX(0.467085129)); /* c11 */
1979 dataptr[3] = (DCTELEM)
1980 DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
1981 - MULTIPLY(tmp5, FIX(3.069855259)), /* c1+c5+c11 */
1982 CONST_BITS);
1983 dataptr[1] = (DCTELEM)
1984 DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
1985 MULTIPLY(tmp0 + tmp6, FIX(1.126980169)), /* c3+c5-c1 */
1986 CONST_BITS);
1987
1988 ctr++;
1989
1990 if (ctr != DCTSIZE) {
1991 if (ctr == 14)
1992 break; /* Done. */
1993 dataptr += DCTSIZE; /* advance pointer to next row */
1994 } else
1995 dataptr = workspace; /* switch pointer to extended workspace */
1996 }
1997
1998 /* Pass 2: process columns.
1999 * We leave the results scaled up by an overall factor of 8.
2000 * We must also scale the output by (8/14)**2 = 16/49, which we partially
2001 * fold into the constant multipliers and final shifting:
2002 * cK now represents sqrt(2) * cos(K*pi/28) * 32/49.
2003 */
2004
2005 dataptr = data;
2006 wsptr = workspace;
2007 for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2008 /* Even part */
2009
2010 tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
2011 tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
2012 tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
2013 tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
2014 tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
2015 tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
2016 tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
2017
2018 tmp10 = tmp0 + tmp6;
2019 tmp14 = tmp0 - tmp6;
2020 tmp11 = tmp1 + tmp5;
2021 tmp15 = tmp1 - tmp5;
2022 tmp12 = tmp2 + tmp4;
2023 tmp16 = tmp2 - tmp4;
2024
2025 tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
2026 tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
2027 tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
2028 tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
2029 tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
2030 tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
2031 tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];
2032
2033 dataptr[DCTSIZE*0] = (DCTELEM)
2034 DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
2035 FIX(0.653061224)), /* 32/49 */
2036 CONST_BITS+1+PASS2_BITS-PASS1_BITS);
2037 tmp13 += tmp13;
2038 dataptr[DCTSIZE*4] = (DCTELEM)
2039 DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
2040 MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
2041 MULTIPLY(tmp12 - tmp13, FIX(0.575835255)), /* c8 */
2042 CONST_BITS+1+PASS2_BITS-PASS1_BITS);
2043
2044 tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570)); /* c6 */
2045
2046 dataptr[DCTSIZE*2] = (DCTELEM)
2047 DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691)) /* c2-c6 */
2048 + MULTIPLY(tmp16, FIX(0.400721155)), /* c10 */
2049 CONST_BITS+1+PASS2_BITS-PASS1_BITS);
2050 dataptr[DCTSIZE*6] = (DCTELEM)
2051 DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725)) /* c6+c10 */
2052 - MULTIPLY(tmp16, FIX(0.900412262)), /* c2 */
2053 CONST_BITS+1+PASS2_BITS-PASS1_BITS);
2054
2055 /* Odd part */
2056
2057 tmp10 = tmp1 + tmp2;
2058 tmp11 = tmp5 - tmp4;
2059 dataptr[DCTSIZE*7] = (DCTELEM)
2060 DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
2061 FIX(0.653061224)), /* 32/49 */
2062 CONST_BITS+1+PASS2_BITS-PASS1_BITS);
2063 tmp3 = MULTIPLY(tmp3 , FIX(0.653061224)); /* 32/49 */
2064 tmp10 = MULTIPLY(tmp10, - FIX(0.103406812)); /* -c13 */
2065 tmp11 = MULTIPLY(tmp11, FIX(0.917760839)); /* c1 */
2066 tmp10 += tmp11 - tmp3;
2067 tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) + /* c5 */
2068 MULTIPLY(tmp4 + tmp6, FIX(0.491367823)); /* c9 */
2069 dataptr[DCTSIZE*5] = (DCTELEM)
2070 DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
2071 + MULTIPLY(tmp4, FIX(0.731428202)), /* c1+c11-c9 */
2072 CONST_BITS+1+PASS2_BITS-PASS1_BITS);
2073 tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) + /* c3 */
2074 MULTIPLY(tmp5 - tmp6, FIX(0.305035186)); /* c11 */
2075 dataptr[DCTSIZE*3] = (DCTELEM)
2076 DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
2077 - MULTIPLY(tmp5, FIX(2.004803435)), /* c1+c5+c11 */
2078 CONST_BITS+1+PASS2_BITS-PASS1_BITS);
2079 dataptr[DCTSIZE*1] = (DCTELEM)
2080 DESCALE(tmp11 + tmp12 + tmp3
2081 - MULTIPLY(tmp0, FIX(0.735987049)) /* c3+c5-c1 */
2082 - MULTIPLY(tmp6, FIX(0.082925825)), /* c9-c11-c13 */
2083 CONST_BITS+1+PASS2_BITS-PASS1_BITS);
2084
2085 dataptr++; /* advance pointer to next column */
2086 wsptr++; /* advance pointer to next column */
2087 }
2088}
2089
2090
2091/*
2092 * Perform the forward DCT on a 15x15 sample block.
2093 */
2094
2095GLOBAL(void)
2097{
2098 INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2099 INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2100 INT32 z1, z2, z3;
2101 DCTELEM workspace[8*7];
2103 DCTELEM *wsptr;
2104 JSAMPROW elemptr;
2105 int ctr;
2107
2108 /* Pass 1: process rows.
2109 * Note results are scaled up by sqrt(8) compared to a true DCT.
2110 * cK represents sqrt(2) * cos(K*pi/30).
2111 */
2112
2113 dataptr = data;
2114 ctr = 0;
2115 for (;;) {
2116 elemptr = sample_data[ctr] + start_col;
2117
2118 /* Even part */
2119
2120 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[14]);
2121 tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[13]);
2122 tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[12]);
2123 tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[11]);
2124 tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[10]);
2125 tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[9]);
2126 tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[8]);
2127 tmp7 = GETJSAMPLE(elemptr[7]);
2128
2129 tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[14]);
2130 tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[13]);
2131 tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[12]);
2132 tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[11]);
2133 tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[10]);
2134 tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[9]);
2135 tmp16 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[8]);
2136
2137 z1 = tmp0 + tmp4 + tmp5;
2138 z2 = tmp1 + tmp3 + tmp6;
2139 z3 = tmp2 + tmp7;
2140 /* Apply unsigned->signed conversion. */
2141 dataptr[0] = (DCTELEM) (z1 + z2 + z3 - 15 * CENTERJSAMPLE);
2142 z3 += z3;
2143 dataptr[6] = (DCTELEM)
2144 DESCALE(MULTIPLY(z1 - z3, FIX(1.144122806)) - /* c6 */
2145 MULTIPLY(z2 - z3, FIX(0.437016024)), /* c12 */
2146 CONST_BITS);
2147 tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
2148 z1 = MULTIPLY(tmp3 - tmp2, FIX(1.531135173)) - /* c2+c14 */
2149 MULTIPLY(tmp6 - tmp2, FIX(2.238241955)); /* c4+c8 */
2150 z2 = MULTIPLY(tmp5 - tmp2, FIX(0.798468008)) - /* c8-c14 */
2151 MULTIPLY(tmp0 - tmp2, FIX(0.091361227)); /* c2-c4 */
2152 z3 = MULTIPLY(tmp0 - tmp3, FIX(1.383309603)) + /* c2 */
2153 MULTIPLY(tmp6 - tmp5, FIX(0.946293579)) + /* c8 */
2154 MULTIPLY(tmp1 - tmp4, FIX(0.790569415)); /* (c6+c12)/2 */
2155
2156 dataptr[2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS);
2157 dataptr[4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS);
2158
2159 /* Odd part */
2160
2161 tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
2162 FIX(1.224744871)); /* c5 */
2163 tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.344997024)) + /* c3 */
2164 MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.831253876)); /* c9 */
2165 tmp12 = MULTIPLY(tmp12, FIX(1.224744871)); /* c5 */
2166 tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.406466353)) + /* c1 */
2167 MULTIPLY(tmp11 + tmp14, FIX(1.344997024)) + /* c3 */
2168 MULTIPLY(tmp13 + tmp15, FIX(0.575212477)); /* c11 */
2169 tmp0 = MULTIPLY(tmp13, FIX(0.475753014)) - /* c7-c11 */
2170 MULTIPLY(tmp14, FIX(0.513743148)) + /* c3-c9 */
2171 MULTIPLY(tmp16, FIX(1.700497885)) + tmp4 + tmp12; /* c1+c13 */
2172 tmp3 = MULTIPLY(tmp10, - FIX(0.355500862)) - /* -(c1-c7) */
2173 MULTIPLY(tmp11, FIX(2.176250899)) - /* c3+c9 */
2174 MULTIPLY(tmp15, FIX(0.869244010)) + tmp4 - tmp12; /* c11+c13 */
2175
2176 dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
2177 dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
2178 dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
2179 dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
2180
2181 ctr++;
2182
2183 if (ctr != DCTSIZE) {
2184 if (ctr == 15)
2185 break; /* Done. */
2186 dataptr += DCTSIZE; /* advance pointer to next row */
2187 } else
2188 dataptr = workspace; /* switch pointer to extended workspace */
2189 }
2190
2191 /* Pass 2: process columns.
2192 * We leave the results scaled up by an overall factor of 8.
2193 * We must also scale the output by (8/15)**2 = 64/225, which we partially
2194 * fold into the constant multipliers and final shifting:
2195 * cK now represents sqrt(2) * cos(K*pi/30) * 256/225.
2196 */
2197
2198 dataptr = data;
2199 wsptr = workspace;
2200 for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2201 /* Even part */
2202
2203 tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*6];
2204 tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*5];
2205 tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*4];
2206 tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*3];
2207 tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*2];
2208 tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*1];
2209 tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*0];
2210 tmp7 = dataptr[DCTSIZE*7];
2211
2212 tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*6];
2213 tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*5];
2214 tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*4];
2215 tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*3];
2216 tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*2];
2217 tmp15 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*1];
2218 tmp16 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*0];
2219
2220 z1 = tmp0 + tmp4 + tmp5;
2221 z2 = tmp1 + tmp3 + tmp6;
2222 z3 = tmp2 + tmp7;
2223 dataptr[DCTSIZE*0] = (DCTELEM)
2224 DESCALE(MULTIPLY(z1 + z2 + z3, FIX(1.137777778)), /* 256/225 */
2225 CONST_BITS+2+PASS2_BITS-PASS1_BITS);
2226 z3 += z3;
2227 dataptr[DCTSIZE*6] = (DCTELEM)
2228 DESCALE(MULTIPLY(z1 - z3, FIX(1.301757503)) - /* c6 */
2229 MULTIPLY(z2 - z3, FIX(0.497227121)), /* c12 */
2230 CONST_BITS+2+PASS2_BITS-PASS1_BITS);
2231 tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
2232 z1 = MULTIPLY(tmp3 - tmp2, FIX(1.742091575)) - /* c2+c14 */
2233 MULTIPLY(tmp6 - tmp2, FIX(2.546621957)); /* c4+c8 */
2234 z2 = MULTIPLY(tmp5 - tmp2, FIX(0.908479156)) - /* c8-c14 */
2235 MULTIPLY(tmp0 - tmp2, FIX(0.103948774)); /* c2-c4 */
2236 z3 = MULTIPLY(tmp0 - tmp3, FIX(1.573898926)) + /* c2 */
2237 MULTIPLY(tmp6 - tmp5, FIX(1.076671805)) + /* c8 */
2238 MULTIPLY(tmp1 - tmp4, FIX(0.899492312)); /* (c6+c12)/2 */
2239
2240 dataptr[DCTSIZE*2] = (DCTELEM)
2241 DESCALE(z1 + z3, CONST_BITS+2+PASS2_BITS-PASS1_BITS);
2242 dataptr[DCTSIZE*4] = (DCTELEM)
2243 DESCALE(z2 + z3, CONST_BITS+2+PASS2_BITS-PASS1_BITS);
2244
2245 /* Odd part */
2246
2247 tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
2248 FIX(1.393487498)); /* c5 */
2249 tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.530307725)) + /* c3 */
2250 MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.945782187)); /* c9 */
2251 tmp12 = MULTIPLY(tmp12, FIX(1.393487498)); /* c5 */
2252 tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.600246161)) + /* c1 */
2253 MULTIPLY(tmp11 + tmp14, FIX(1.530307725)) + /* c3 */
2254 MULTIPLY(tmp13 + tmp15, FIX(0.654463974)); /* c11 */
2255 tmp0 = MULTIPLY(tmp13, FIX(0.541301207)) - /* c7-c11 */
2256 MULTIPLY(tmp14, FIX(0.584525538)) + /* c3-c9 */
2257 MULTIPLY(tmp16, FIX(1.934788705)) + tmp4 + tmp12; /* c1+c13 */
2258 tmp3 = MULTIPLY(tmp10, - FIX(0.404480980)) - /* -(c1-c7) */
2259 MULTIPLY(tmp11, FIX(2.476089912)) - /* c3+c9 */
2260 MULTIPLY(tmp15, FIX(0.989006518)) + tmp4 - tmp12; /* c11+c13 */
2261
2262 dataptr[DCTSIZE*1] = (DCTELEM)
2263 DESCALE(tmp0, CONST_BITS+2+PASS2_BITS-PASS1_BITS);
2264 dataptr[DCTSIZE*3] = (DCTELEM)
2265 DESCALE(tmp1, CONST_BITS+2+PASS2_BITS-PASS1_BITS);
2266 dataptr[DCTSIZE*5] = (DCTELEM)
2267 DESCALE(tmp2, CONST_BITS+2+PASS2_BITS-PASS1_BITS);
2268 dataptr[DCTSIZE*7] = (DCTELEM)
2269 DESCALE(tmp3, CONST_BITS+2+PASS2_BITS-PASS1_BITS);
2270
2271 dataptr++; /* advance pointer to next column */
2272 wsptr++; /* advance pointer to next column */
2273 }
2274}
2275
2276
2277/*
2278 * Perform the forward DCT on a 16x16 sample block.
2279 */
2280
2281GLOBAL(void)
2283{
2284 INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2285 INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
2286 DCTELEM workspace[DCTSIZE2];
2288 DCTELEM *wsptr;
2289 JSAMPROW elemptr;
2290 int ctr;
2292
2293 /* Pass 1: process rows.
2294 * Note results are scaled up by sqrt(8) compared to a true DCT;
2295 * furthermore, we scale the results by 2**PASS1_BITS.
2296 * cK represents sqrt(2) * cos(K*pi/32).
2297 */
2298
2299 dataptr = data;
2300 ctr = 0;
2301 for (;;) {
2302 elemptr = sample_data[ctr] + start_col;
2303
2304 /* Even part */
2305
2306 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
2307 tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
2308 tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
2309 tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
2310 tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
2311 tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
2312 tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
2313 tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
2314
2315 tmp10 = tmp0 + tmp7;
2316 tmp14 = tmp0 - tmp7;
2317 tmp11 = tmp1 + tmp6;
2318 tmp15 = tmp1 - tmp6;
2319 tmp12 = tmp2 + tmp5;
2320 tmp16 = tmp2 - tmp5;
2321 tmp13 = tmp3 + tmp4;
2322 tmp17 = tmp3 - tmp4;
2323
2324 tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
2325 tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
2326 tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
2327 tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
2328 tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
2329 tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
2330 tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
2331 tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
2332
2333 /* Apply unsigned->signed conversion. */
2334 dataptr[0] =
2335 PASS1_OUTPUT(tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE);
2336 dataptr[4] = (DCTELEM)
2337 DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2338 MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
2339 CONST_BITS-PASS1_BITS);
2340
2341 tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
2342 MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
2343
2344 dataptr[2] = (DCTELEM)
2345 DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
2346 + MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
2347 CONST_BITS-PASS1_BITS);
2348 dataptr[6] = (DCTELEM)
2349 DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
2350 - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
2351 CONST_BITS-PASS1_BITS);
2352
2353 /* Odd part */
2354
2355 tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
2356 MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
2357 tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
2358 MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
2359 tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
2360 MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
2361 tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
2362 MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
2363 tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
2364 MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
2365 tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
2366 MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
2367 tmp10 = tmp11 + tmp12 + tmp13 -
2368 MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
2369 MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
2370 tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
2371 - MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
2372 tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
2373 + MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
2374 tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
2375 + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
2376
2377 dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2378 dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2379 dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2380 dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2381
2382 ctr++;
2383
2384 if (ctr != DCTSIZE) {
2385 if (ctr == DCTSIZE * 2)
2386 break; /* Done. */
2387 dataptr += DCTSIZE; /* advance pointer to next row */
2388 } else
2389 dataptr = workspace; /* switch pointer to extended workspace */
2390 }
2391
2392 /* Pass 2: process columns.
2393 * We apply the PASS2_BITS scaling, but leave the
2394 * results scaled up by an overall factor of 8.
2395 * We must also scale the output by (8/16)**2 = 1/2**2.
2396 * cK represents sqrt(2) * cos(K*pi/32).
2397 */
2398
2399 dataptr = data;
2400 wsptr = workspace;
2401 for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2402 /* Even part */
2403
2404 tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
2405 tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
2406 tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
2407 tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
2408 tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
2409 tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
2410 tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
2411 tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
2412
2413 tmp10 = tmp0 + tmp7;
2414 tmp14 = tmp0 - tmp7;
2415 tmp11 = tmp1 + tmp6;
2416 tmp15 = tmp1 - tmp6;
2417 tmp12 = tmp2 + tmp5;
2418 tmp16 = tmp2 - tmp5;
2419 tmp13 = tmp3 + tmp4;
2420 tmp17 = tmp3 - tmp4;
2421
2422 tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
2423 tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
2424 tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
2425 tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
2426 tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
2427 tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
2428 tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
2429 tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];
2430
2431 dataptr[DCTSIZE*0] = (DCTELEM)
2432 DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS2_BITS+2);
2433 dataptr[DCTSIZE*4] = (DCTELEM)
2434 DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2435 MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
2436 CONST_BITS+PASS2_BITS+2);
2437
2438 tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
2439 MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
2440
2441 dataptr[DCTSIZE*2] = (DCTELEM)
2442 DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
2443 + MULTIPLY(tmp16, FIX(2.172734804)), /* c2+10 */
2444 CONST_BITS+PASS2_BITS+2);
2445 dataptr[DCTSIZE*6] = (DCTELEM)
2446 DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
2447 - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
2448 CONST_BITS+PASS2_BITS+2);
2449
2450 /* Odd part */
2451
2452 tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
2453 MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
2454 tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
2455 MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
2456 tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
2457 MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
2458 tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
2459 MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
2460 tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
2461 MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
2462 tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
2463 MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
2464 tmp10 = tmp11 + tmp12 + tmp13 -
2465 MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
2466 MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
2467 tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
2468 - MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
2469 tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
2470 + MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
2471 tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
2472 + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
2473
2474 dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS2_BITS+2);
2475 dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS2_BITS+2);
2476 dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS2_BITS+2);
2477 dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS2_BITS+2);
2478
2479 dataptr++; /* advance pointer to next column */
2480 wsptr++; /* advance pointer to next column */
2481 }
2482}
2483
2484
2485/*
2486 * Perform the forward DCT on a 16x8 sample block.
2487 *
2488 * 16-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
2489 */
2490
2491GLOBAL(void)
2493{
2494 INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2495 INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
2496 INT32 z1;
2498 JSAMPROW elemptr;
2499 int ctr;
2501
2502 /* Pass 1: process rows.
2503 * Note results are scaled up by sqrt(8) compared to a true DCT;
2504 * furthermore, we scale the results by 2**PASS1_BITS.
2505 * 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
2506 */
2507
2508 dataptr = data;
2509 ctr = 0;
2510 for (ctr = 0; ctr < DCTSIZE; ctr++) {
2511 elemptr = sample_data[ctr] + start_col;
2512
2513 /* Even part */
2514
2515 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
2516 tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
2517 tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
2518 tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
2519 tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
2520 tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
2521 tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
2522 tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
2523
2524 tmp10 = tmp0 + tmp7;
2525 tmp14 = tmp0 - tmp7;
2526 tmp11 = tmp1 + tmp6;
2527 tmp15 = tmp1 - tmp6;
2528 tmp12 = tmp2 + tmp5;
2529 tmp16 = tmp2 - tmp5;
2530 tmp13 = tmp3 + tmp4;
2531 tmp17 = tmp3 - tmp4;
2532
2533 tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
2534 tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
2535 tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
2536 tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
2537 tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
2538 tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
2539 tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
2540 tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
2541
2542 /* Apply unsigned->signed conversion. */
2543 dataptr[0] =
2544 PASS1_OUTPUT(tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE);
2545 dataptr[4] = (DCTELEM)
2546 DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2547 MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
2548 CONST_BITS-PASS1_BITS);
2549
2550 tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
2551 MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
2552
2553 dataptr[2] = (DCTELEM)
2554 DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
2555 + MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
2556 CONST_BITS-PASS1_BITS);
2557 dataptr[6] = (DCTELEM)
2558 DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
2559 - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
2560 CONST_BITS-PASS1_BITS);
2561
2562 /* Odd part */
2563
2564 tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
2565 MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
2566 tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
2567 MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
2568 tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
2569 MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
2570 tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
2571 MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
2572 tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
2573 MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
2574 tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
2575 MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
2576 tmp10 = tmp11 + tmp12 + tmp13 -
2577 MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
2578 MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
2579 tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
2580 - MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
2581 tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
2582 + MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
2583 tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
2584 + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
2585
2586 dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2587 dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2588 dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2589 dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2590
2591 dataptr += DCTSIZE; /* advance pointer to next row */
2592 }
2593
2594 /* Pass 2: process columns.
2595 * We apply the PASS2_BITS scaling, but leave the
2596 * results scaled up by an overall factor of 8.
2597 * We must also scale the output by 8/16 = 1/2.
2598 * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
2599 */
2600
2601 dataptr = data;
2602 for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2603 /* Even part per LL&M figure 1 --- note that published figure is faulty;
2604 * rotator "c1" should be "c6".
2605 */
2606
2607 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
2608 tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
2609 tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
2610 tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
2611
2612 /* Add fudge factor here for final descale. */
2613#if PASS2_BITS > 0
2614 tmp10 = tmp0 + tmp3 + (ONE << PASS2_BITS);
2615#else
2616 tmp10 = tmp0 + tmp3 + ONE;
2617#endif
2618 tmp12 = tmp0 - tmp3;
2619 tmp11 = tmp1 + tmp2;
2620 tmp13 = tmp1 - tmp2;
2621
2622 tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
2623 tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
2624 tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
2625 tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
2626
2627 dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS2_BITS+1);
2628 dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS2_BITS+1);
2629
2630 z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); /* c6 */
2631 /* Add fudge factor here for final descale. */
2632 z1 += ONE << (CONST_BITS+PASS2_BITS);
2633
2634 dataptr[DCTSIZE*2] = (DCTELEM)
2635 RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
2636 CONST_BITS+PASS2_BITS+1);
2637 dataptr[DCTSIZE*6] = (DCTELEM)
2638 RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
2639 CONST_BITS+PASS2_BITS+1);
2640
2641 /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
2642 * i0..i3 in the paper are tmp0..tmp3 here.
2643 */
2644
2645 tmp12 = tmp0 + tmp2;
2646 tmp13 = tmp1 + tmp3;
2647
2648 z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
2649 /* Add fudge factor here for final descale. */
2650 z1 += ONE << (CONST_BITS+PASS2_BITS);
2651
2652 tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* -c3+c5 */
2653 tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
2654 tmp12 += z1;
2655 tmp13 += z1;
2656
2657 z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
2658 tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
2659 tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
2660 tmp0 += z1 + tmp12;
2661 tmp3 += z1 + tmp13;
2662
2663 z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
2664 tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
2665 tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
2666 tmp1 += z1 + tmp13;
2667 tmp2 += z1 + tmp12;
2668
2669 dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS+PASS2_BITS+1);
2670 dataptr[DCTSIZE*3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS+PASS2_BITS+1);
2671 dataptr[DCTSIZE*5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS+PASS2_BITS+1);
2672 dataptr[DCTSIZE*7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS+PASS2_BITS+1);
2673
2674 dataptr++; /* advance pointer to next column */
2675 }
2676}
2677
2678
2679/*
2680 * Perform the forward DCT on a 14x7 sample block.
2681 *
2682 * 14-point FDCT in pass 1 (rows), 7-point in pass 2 (columns).
 *
 * Input is read from sample_data[0..6], 14 samples per row beginning at
 * column start_col.  Output coefficients are written to data, whose rows
 * are DCTSIZE elements apart.  Pass 1 stores 8 coefficients for each of
 * the 7 input rows; pass 2 then transforms each of the DCTSIZE columns
 * in place, producing rows 0..6.
 *
 * NOTE(review): the listing numbers embedded in this block skip 2686,
 * 2691, 2694 and 2697, so some lines are not visible here -- presumably
 * the name/parameter line, the declaration of dataptr and the statement
 * that zeroes the bottom row.  Confirm against the upstream file.
2683 */
2684
2685GLOBAL(void)
2687{
2688 INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
2689 INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2690 INT32 z1, z2, z3;
2692 JSAMPROW elemptr;
2693 int ctr;
2695
2696 /* Zero bottom row of output coefficient block. */
2698
2699 /* Pass 1: process rows.
2700 * Note results are scaled up by sqrt(8) compared to a true DCT;
2701 * furthermore, we scale the results by 2**PASS1_BITS.
2702 * 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
2703 */
2704
2705 dataptr = data;
2706 for (ctr = 0; ctr < 7; ctr++) {
2707 elemptr = sample_data[ctr] + start_col;
2708
2709 /* Even part */
2710
 /* Sums of mirrored sample pairs.  The centre pair (3,10) is stored
  * directly in tmp13 because it takes no part in the butterflies below.
  */
2711 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
2712 tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
2713 tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
2714 tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
2715 tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
2716 tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
2717 tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
2718
2719 tmp10 = tmp0 + tmp6;
2720 tmp14 = tmp0 - tmp6;
2721 tmp11 = tmp1 + tmp5;
2722 tmp15 = tmp1 - tmp5;
2723 tmp12 = tmp2 + tmp4;
2724 tmp16 = tmp2 - tmp4;
2725
 /* Differences of the same mirrored pairs; these feed the odd part. */
2726 tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
2727 tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
2728 tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
2729 tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
2730 tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
2731 tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
2732 tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
2733
2734 /* Apply unsigned->signed conversion. */
2735 dataptr[0] =
2736 PASS1_OUTPUT(tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE);
2737 tmp13 += tmp13; /* doubled: subtracted from each sum in dataptr[4] */
2738 dataptr[4] = (DCTELEM)
2739 DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
2740 MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
2741 MULTIPLY(tmp12 - tmp13, FIX(0.881747734)), /* c8 */
2742 CONST_BITS-PASS1_BITS);
2743
 /* tmp10 is reused: product shared by dataptr[2] and dataptr[6]. */
2744 tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686)); /* c6 */
2745
2746 dataptr[2] = (DCTELEM)
2747 DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590)) /* c2-c6 */
2748 + MULTIPLY(tmp16, FIX(0.613604268)), /* c10 */
2749 CONST_BITS-PASS1_BITS);
2750 dataptr[6] = (DCTELEM)
2751 DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954)) /* c6+c10 */
2752 - MULTIPLY(tmp16, FIX(1.378756276)), /* c2 */
2753 CONST_BITS-PASS1_BITS);
2754
2755 /* Odd part */
2756
 /* tmp10..tmp12 are reused as scratch; inputs are the differences
  * tmp0..tmp6 computed above.
  */
2757 tmp10 = tmp1 + tmp2;
2758 tmp11 = tmp5 - tmp4;
2759 dataptr[7] = PASS1_OUTPUT(tmp0 - tmp10 + tmp3 - tmp11 - tmp6);
2760 tmp3 <<= CONST_BITS; /* pre-shift: tmp3 is mixed with MULTIPLY products without a multiplier */
2761 tmp10 = MULTIPLY(tmp10, - FIX(0.158341681)); /* -c13 */
2762 tmp11 = MULTIPLY(tmp11, FIX(1.405321284)); /* c1 */
2763 tmp10 += tmp11 - tmp3;
2764 tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) + /* c5 */
2765 MULTIPLY(tmp4 + tmp6, FIX(0.752406978)); /* c9 */
2766 dataptr[5] = (DCTELEM)
2767 DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
2768 + MULTIPLY(tmp4, FIX(1.119999435)), /* c1+c11-c9 */
2769 CONST_BITS-PASS1_BITS);
2770 tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) + /* c3 */
2771 MULTIPLY(tmp5 - tmp6, FIX(0.467085129)); /* c11 */
2772 dataptr[3] = (DCTELEM)
2773 DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
2774 - MULTIPLY(tmp5, FIX(3.069855259)), /* c1+c5+c11 */
2775 CONST_BITS-PASS1_BITS);
 /* NOTE(review): in the expression below tmp6 is added as a plain
  * integer, whereas tmp3 was shifted left by CONST_BITS before being
  * combined with MULTIPLY products.  Check against the MULTIPLY/FIX
  * definitions whether the unshifted tmp6 term is intended.
  */
2776 dataptr[1] = (DCTELEM)
2777 DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
2778 MULTIPLY(tmp0 + tmp6, FIX(1.126980169)), /* c3+c5-c1 */
2779 CONST_BITS-PASS1_BITS);
2780
2781 dataptr += DCTSIZE; /* advance pointer to next row */
2782 }
2783
2784 /* Pass 2: process columns.
2785 * We apply the PASS2_BITS scaling, but leave the
2786 * results scaled up by an overall factor of 8.
2787 * We must also scale the output by (8/14)*(8/7) = 32/49, which we
2788 * partially fold into the constant multipliers and final shifting:
2789 * 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14) * 64/49.
2790 */
2791
2792 dataptr = data;
2793 for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2794 /* Even part */
2795
 /* Sums of mirrored rows; the middle row 3 has no partner. */
2796 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
2797 tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
2798 tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
2799 tmp3 = dataptr[DCTSIZE*3];
2800
 /* Differences are taken now because rows 0..6 are overwritten below. */
2801 tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
2802 tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
2803 tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];
2804
2805 z1 = tmp0 + tmp2;
2806 dataptr[DCTSIZE*0] = (DCTELEM)
2807 DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
2808 CONST_BITS+PASS2_BITS+1);
 /* After the next three statements tmp3 holds twice its former value
  * and z1 has been reduced by four times the former tmp3.
  */
2809 tmp3 += tmp3;
2810 z1 -= tmp3;
2811 z1 -= tmp3;
2812 z1 = MULTIPLY(z1, FIX(0.461784020)); /* (c2+c6-c4)/2 */
2813 z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084)); /* (c2+c4-c6)/2 */
2814 z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446)); /* c6 */
2815 dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS2_BITS+1);
2816 z1 -= z2;
2817 z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509)); /* c4 */
2818 dataptr[DCTSIZE*4] = (DCTELEM)
2819 DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
2820 CONST_BITS+PASS2_BITS+1);
2821 dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS2_BITS+1);
2822
2823 /* Odd part */
2824
 /* tmp0..tmp3 are reused as scratch from here on; the odd-part inputs
  * are the differences tmp10..tmp12.
  */
2825 tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677)); /* (c3+c1-c5)/2 */
2826 tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464)); /* (c3+c5-c1)/2 */
2827 tmp0 = tmp1 - tmp2;
2828 tmp1 += tmp2;
2829 tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
2830 tmp1 += tmp2;
2831 tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310)); /* c5 */
2832 tmp0 += tmp3;
2833 tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355)); /* c3+c1-c5 */
2834
2835 dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS2_BITS+1);
2836 dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS2_BITS+1);
2837 dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS2_BITS+1);
2838
2839 dataptr++; /* advance pointer to next column */
2840 }
2841}
2842
2843
2844/*
2845 * Perform the forward DCT on a 12x6 sample block.
2846 *
2847 * 12-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
 *
 * Input is read from sample_data[0..5], 12 samples per row beginning at
 * column start_col.  Output coefficients are written to data, whose rows
 * are DCTSIZE elements apart.  Pass 1 stores 8 coefficients for each of
 * the 6 input rows; pass 2 then transforms each of the DCTSIZE columns
 * in place, producing rows 0..5.
 *
 * NOTE(review): the listing numbers embedded in this block skip 2851,
 * 2855, 2858 and 2861, so some lines are not visible here -- presumably
 * the name/parameter line, the declaration of dataptr and the statement
 * that zeroes the two bottom rows.  Confirm against the upstream file.
2848 */
2849
2850GLOBAL(void)
2852{
2853 INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
2854 INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2856 JSAMPROW elemptr;
2857 int ctr;
2859
2860 /* Zero 2 bottom rows of output coefficient block. */
2862
2863 /* Pass 1: process rows.
2864 * Note results are scaled up by sqrt(8) compared to a true DCT;
2865 * furthermore, we scale the results by 2**PASS1_BITS.
2866 * 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
2867 */
2868
2869 dataptr = data;
2870 for (ctr = 0; ctr < 6; ctr++) {
2871 elemptr = sample_data[ctr] + start_col;
2872
2873 /* Even part */
2874
 /* Sums of mirrored sample pairs. */
2875 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
2876 tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
2877 tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
2878 tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
2879 tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
2880 tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
2881
2882 tmp10 = tmp0 + tmp5;
2883 tmp13 = tmp0 - tmp5;
2884 tmp11 = tmp1 + tmp4;
2885 tmp14 = tmp1 - tmp4;
2886 tmp12 = tmp2 + tmp3;
2887 tmp15 = tmp2 - tmp3;
2888
 /* Differences of the same mirrored pairs; these feed the odd part. */
2889 tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
2890 tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
2891 tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
2892 tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
2893 tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
2894 tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
2895
2896 /* Apply unsigned->signed conversion. */
2897 dataptr[0] =
2898 PASS1_OUTPUT(tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE);
2899 dataptr[6] = PASS1_OUTPUT(tmp13 - tmp14 - tmp15);
2900 dataptr[4] = (DCTELEM)
2901 DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
2902 CONST_BITS-PASS1_BITS);
 /* NOTE(review): in the expression below tmp14 - tmp15 is added as a
  * plain integer next to a MULTIPLY product, with no multiplier and no
  * shift by CONST_BITS.  Check against the MULTIPLY/FIX definitions
  * whether the unshifted term is intended.
  */
2903 dataptr[2] = (DCTELEM)
2904 DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
2905 CONST_BITS-PASS1_BITS);
2906
2907 /* Odd part */
2908
 /* tmp10..tmp15 are reused as scratch; inputs are the differences
  * tmp0..tmp5 computed above.
  */
2909 tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100); /* c9 */
2910 tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865); /* c3-c9 */
2911 tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065); /* c3+c9 */
2912 tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054)); /* c5 */
2913 tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669)); /* c7 */
2914 tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
2915 + MULTIPLY(tmp5, FIX(0.184591911)); /* c11 */
2916 tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
2917 tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
2918 + MULTIPLY(tmp5, FIX(0.860918669)); /* c7 */
2919 tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
2920 - MULTIPLY(tmp5, FIX(1.121971054)); /* c5 */
2921 tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
2922 - MULTIPLY(tmp2 + tmp5, FIX_0_541196100); /* c9 */
2923
2924 dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2925 dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2926 dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2927 dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2928
2929 dataptr += DCTSIZE; /* advance pointer to next row */
2930 }
2931
2932 /* Pass 2: process columns.
2933 * We apply the PASS2_BITS scaling, but leave the
2934 * results scaled up by an overall factor of 8.
2935 * We must also scale the output by (8/12)*(8/6) = 8/9, which we
2936 * partially fold into the constant multipliers and final shifting:
2937 * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
2938 */
2939
2940 dataptr = data;
2941 for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2942 /* Even part */
2943
 /* Sums of mirrored rows; the middle sum goes straight to tmp11. */
2944 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
2945 tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
2946 tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
2947
2948 tmp10 = tmp0 + tmp2;
2949 tmp12 = tmp0 - tmp2;
2950
 /* Differences are taken now because rows 0..5 are overwritten below. */
2951 tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
2952 tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
2953 tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
2954
2955 dataptr[DCTSIZE*0] = (DCTELEM)
2956 DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
2957 CONST_BITS+PASS2_BITS+1);
2958 dataptr[DCTSIZE*2] = (DCTELEM)
2959 DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
2960 CONST_BITS+PASS2_BITS+1);
2961 dataptr[DCTSIZE*4] = (DCTELEM)
2962 DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
2963 CONST_BITS+PASS2_BITS+1);
2964
2965 /* Odd part */
2966
 /* tmp10 is reused: product shared by rows 1 and 5. */
2967 tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
2968
2969 dataptr[DCTSIZE*1] = (DCTELEM)
2970 DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
2971 CONST_BITS+PASS2_BITS+1);
2972 dataptr[DCTSIZE*3] = (DCTELEM)
2973 DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
2974 CONST_BITS+PASS2_BITS+1);
2975 dataptr[DCTSIZE*5] = (DCTELEM)
2976 DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
2977 CONST_BITS+PASS2_BITS+1);
2978
2979 dataptr++; /* advance pointer to next column */
2980 }
2981}
2982
2983
2984/*
2985 * Perform the forward DCT on a 10x5 sample block.
2986 *
2987 * 10-point FDCT in pass 1 (rows), 5-point in pass 2 (columns).
 *
 * Input is read from sample_data[0..4], 10 samples per row beginning at
 * column start_col.  Output coefficients are written to data, whose rows
 * are DCTSIZE elements apart.  Pass 1 stores 8 coefficients for each of
 * the 5 input rows; pass 2 then transforms each of the DCTSIZE columns
 * in place, producing rows 0..4.
 *
 * NOTE(review): the listing numbers embedded in this block skip 2991,
 * 2995, 2998 and 3001, so some lines are not visible here -- presumably
 * the name/parameter line, the declaration of dataptr and the statement
 * that zeroes the three bottom rows.  Confirm against the upstream file.
2988 */
2989
2990GLOBAL(void)
2992{
2993 INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
2994 INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
2996 JSAMPROW elemptr;
2997 int ctr;
2999
3000 /* Zero 3 bottom rows of output coefficient block. */
3002
3003 /* Pass 1: process rows.
3004 * Note results are scaled up by sqrt(8) compared to a true DCT;
3005 * furthermore, we scale the results by 2**PASS1_BITS.
3006 * 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
3007 */
3008
3009 dataptr = data;
3010 for (ctr = 0; ctr < 5; ctr++) {
3011 elemptr = sample_data[ctr] + start_col;
3012
3013 /* Even part */
3014
 /* Sums of mirrored sample pairs.  The centre pair (2,7) is stored
  * directly in tmp12 because it takes no part in the butterflies below.
  */
3015 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
3016 tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
3017 tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
3018 tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
3019 tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
3020
3021 tmp10 = tmp0 + tmp4;
3022 tmp13 = tmp0 - tmp4;
3023 tmp11 = tmp1 + tmp3;
3024 tmp14 = tmp1 - tmp3;
3025
 /* Differences of the same mirrored pairs; these feed the odd part. */
3026 tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
3027 tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
3028 tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
3029 tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
3030 tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
3031
3032 /* Apply unsigned->signed conversion. */
3033 dataptr[0] =
3034 PASS1_OUTPUT(tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE);
3035 tmp12 += tmp12; /* doubled: subtracted from both sums in dataptr[4] */
3036 dataptr[4] = (DCTELEM)
3037 DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
3038 MULTIPLY(tmp11 - tmp12, FIX(0.437016024)), /* c8 */
3039 CONST_BITS-PASS1_BITS);
 /* tmp10 is reused: product shared by dataptr[2] and dataptr[6]. */
3040 tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876)); /* c6 */
3041 dataptr[2] = (DCTELEM)
3042 DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)), /* c2-c6 */
3043 CONST_BITS-PASS1_BITS);
3044 dataptr[6] = (DCTELEM)
3045 DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)), /* c2+c6 */
3046 CONST_BITS-PASS1_BITS);
3047
3048 /* Odd part */
3049
 /* tmp10..tmp13 are reused as scratch; inputs are the differences
  * tmp0..tmp4 computed above.
  */
3050 tmp10 = tmp0 + tmp4;
3051 tmp11 = tmp1 - tmp3;
3052 dataptr[5] = PASS1_OUTPUT(tmp10 - tmp11 - tmp2);
3053 tmp2 <<= CONST_BITS; /* pre-shift: tmp2 is mixed with MULTIPLY products without a multiplier */
3054 dataptr[1] = (DCTELEM)
3055 DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) + /* c1 */
3056 MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 + /* c3 */
3057 MULTIPLY(tmp3, FIX(0.642039522)) + /* c7 */
3058 MULTIPLY(tmp4, FIX(0.221231742)), /* c9 */
3059 CONST_BITS-PASS1_BITS);
3060 tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) - /* (c3+c7)/2 */
3061 MULTIPLY(tmp1 + tmp3, FIX(0.587785252)); /* (c1-c9)/2 */
 /* The shift by CONST_BITS - 1 contributes half of what a shift by
  * CONST_BITS would; tmp2 here is the already shifted value.
  */
3062 tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) + /* (c3-c7)/2 */
3063 (tmp11 << (CONST_BITS - 1)) - tmp2;
3064 dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-PASS1_BITS);
3065 dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-PASS1_BITS);
3066
3067 dataptr += DCTSIZE; /* advance pointer to next row */
3068 }
3069
3070 /* Pass 2: process columns.
3071 * We apply the PASS2_BITS scaling, but leave the
3072 * results scaled up by an overall factor of 8.
3073 * We must also scale the output by (8/10)*(8/5) = 32/25,
3074 * which we fold into the constant multipliers:
3075 * 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10) * 32/25.
3076 */
3077
3078 dataptr = data;
3079 for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
3080 /* Even part */
3081
 /* Sums of mirrored rows; the middle row 2 has no partner. */
3082 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
3083 tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
3084 tmp2 = dataptr[DCTSIZE*2];
3085
3086 tmp10 = tmp0 + tmp1;
3087 tmp11 = tmp0 - tmp1;
3088
 /* Differences are taken now because rows 0..4 are overwritten below. */
3089 tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
3090 tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];
3091
3092 dataptr[DCTSIZE*0] = (DCTELEM)
3093 DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)), /* 32/25 */
3094 CONST_BITS+PASS2_BITS);
3095 tmp11 = MULTIPLY(tmp11, FIX(1.011928851)); /* (c2+c4)/2 */
3096 tmp10 -= tmp2 << 2; /* subtract four times the middle row */
3097 tmp10 = MULTIPLY(tmp10, FIX(0.452548340)); /* (c2-c4)/2 */
3098 dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS2_BITS);
3099 dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS2_BITS);
3100
3101 /* Odd part */
3102
 /* tmp10 is reused: product shared by rows 1 and 3. */
3103 tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961)); /* c3 */
3104
3105 dataptr[DCTSIZE*1] = (DCTELEM)
3106 DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
3107 CONST_BITS+PASS2_BITS);
3108 dataptr[DCTSIZE*3] = (DCTELEM)
3109 DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
3110 CONST_BITS+PASS2_BITS);
3111
3112 dataptr++; /* advance pointer to next column */
3113 }
3114}
3115
3116
3117/*
3118 * Perform the forward DCT on an 8x4 sample block.
3119 *
3120 * 8-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
 *
 * Input is read from sample_data[0..3], 8 samples per row beginning at
 * column start_col.  Output coefficients are written to data, whose rows
 * are DCTSIZE elements apart.  Pass 1 stores 8 coefficients for each of
 * the 4 input rows; pass 2 then transforms each of the DCTSIZE columns
 * in place, producing rows 0..3.
 *
 * NOTE(review): the listing numbers embedded in this block skip 3124,
 * 3129, 3132 and 3135, so some lines are not visible here -- presumably
 * the name/parameter line, the declaration of dataptr and the statement
 * that zeroes the four bottom rows.  Confirm against the upstream file.
3121 */
3122
3123GLOBAL(void)
3125{
3126 INT32 tmp0, tmp1, tmp2, tmp3;
3127 INT32 tmp10, tmp11, tmp12, tmp13;
3128 INT32 z1;
3130 JSAMPROW elemptr;
3131 int ctr;
3133
3134 /* Zero 4 bottom rows of output coefficient block. */
3136
3137 /* Pass 1: process rows.
3138 * Note results are scaled up by sqrt(8) compared to a true DCT;
3139 * furthermore, we scale the results by 2**PASS1_BITS.
3140 * We must also scale the output by 8/4 = 2, which we add here.
3141 * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3142 */
3143
3144 dataptr = data;
3145 for (ctr = 0; ctr < 4; ctr++) {
3146 elemptr = sample_data[ctr] + start_col;
3147
3148 /* Even part per LL&M figure 1 --- note that published figure is faulty;
3149 * rotator "c1" should be "c6".
3150 */
3151
3152 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
3153 tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
3154 tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
3155 tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
3156
3157 tmp10 = tmp0 + tmp3;
3158 tmp12 = tmp0 - tmp3;
3159 tmp11 = tmp1 + tmp2;
3160 tmp13 = tmp1 - tmp2;
3161
 /* Differences of the same mirrored pairs; these feed the odd part. */
3162 tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
3163 tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
3164 tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
3165 tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
3166
3167 /* Apply unsigned->signed conversion. */
 /* The extra factor of 2 shows up as one more bit of left shift here
  * and one bit less of right shift in the outputs below.
  */
3168 dataptr[0] = (DCTELEM)
3169 ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << (PASS1_BITS+1));
3170 dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << (PASS1_BITS+1));
3171
3172 z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); /* c6 */
3173 /* Add fudge factor here for final descale. */
 /* The constant is half of 1 << (CONST_BITS-PASS1_BITS-1), the shift
  * count used below; z1 carries it into both dataptr[2] and dataptr[6].
  */
3174 z1 += ONE << (CONST_BITS-PASS1_BITS-2);
3175
3176 dataptr[2] = (DCTELEM)
3177 RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
3178 CONST_BITS-PASS1_BITS-1);
3179 dataptr[6] = (DCTELEM)
3180 RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
3181 CONST_BITS-PASS1_BITS-1);
3182
3183 /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
3184 * i0..i3 in the paper are tmp0..tmp3 here.
3185 */
3186
3187 tmp12 = tmp0 + tmp2;
3188 tmp13 = tmp1 + tmp3;
3189
3190 z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
3191 /* Add fudge factor here for final descale. */
 /* z1 is folded into tmp12 and tmp13 below, and each of the four odd
  * outputs includes exactly one of those two, so the constant is added
  * once per output.
  */
3192 z1 += ONE << (CONST_BITS-PASS1_BITS-2);
3193
3194 tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* -c3+c5 */
3195 tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
3196 tmp12 += z1;
3197 tmp13 += z1;
3198
3199 z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
3200 tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
3201 tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
3202 tmp0 += z1 + tmp12;
3203 tmp3 += z1 + tmp13;
3204
3205 z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
3206 tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
3207 tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
3208 tmp1 += z1 + tmp13;
3209 tmp2 += z1 + tmp12;
3210
3211 dataptr[1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS-PASS1_BITS-1);
3212 dataptr[3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS-PASS1_BITS-1);
3213 dataptr[5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS-1);
3214 dataptr[7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS-PASS1_BITS-1);
3215
3216 dataptr += DCTSIZE; /* advance pointer to next row */
3217 }
3218
3219 /* Pass 2: process columns.
3220 * We apply the PASS2_BITS scaling, but leave the
3221 * results scaled up by an overall factor of 8.
3222 * 4-point FDCT kernel,
3223 * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
3224 */
3225
3226 dataptr = data;
3227 for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
3228 /* Even part */
3229
3230 /* Add fudge factor here for final descale. */
 /* The three preprocessor branches differ only in the constant added
  * to the row 0 + row 3 sum: 1 << (PASS2_BITS-1) when PASS2_BITS > 1,
  * ONE when PASS2_BITS is 1, nothing when it is 0.
  */
3231#if PASS2_BITS > 1
3232 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS2_BITS-1));
3233#else
3234#if PASS2_BITS > 0
3235 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + ONE;
3236#else
3237 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3];
3238#endif
3239#endif
3240 tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
3241
 /* Differences are taken now because rows 0 and 2 are overwritten next. */
3242 tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
3243 tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
3244
3245 dataptr[DCTSIZE*0] = PASS2_OUTPUT(tmp0 + tmp1);
3246 dataptr[DCTSIZE*2] = PASS2_OUTPUT(tmp0 - tmp1);
3247
3248 /* Odd part */
3249
 /* tmp0 is reused: product shared by rows 1 and 3. */
3250 tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
3251 /* Add fudge factor here for final descale. */
3252 tmp0 += ONE << (CONST_BITS+PASS2_BITS-1);
3253
3254 dataptr[DCTSIZE*1] = (DCTELEM)
3255 RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
3256 CONST_BITS+PASS2_BITS);
3257 dataptr[DCTSIZE*3] = (DCTELEM)
3258 RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
3259 CONST_BITS+PASS2_BITS);
3260
3261 dataptr++; /* advance pointer to next column */
3262 }
3263}
3264
3265
3266/*
3267 * Perform the forward DCT on a 6x3 sample block.
3268 *
3269 * 6-point FDCT in pass 1 (rows), 3-point in pass 2 (columns).
 *
 * Input is read from sample_data[0..2], 6 samples per row beginning at
 * column start_col.  Output coefficients are written to data, whose rows
 * are DCTSIZE elements apart.  Pass 1 stores 6 coefficients for each of
 * the 3 input rows; pass 2 transforms only those 6 columns in place,
 * producing rows 0..2.
 *
 * NOTE(review): the listing numbers embedded in this block skip 3273,
 * 3277, 3280 and 3283, so some lines are not visible here -- presumably
 * the name/parameter line, the declaration of dataptr and the statement
 * that pre-zeroes the block.  Confirm against the upstream file.
3270 */
3271
3272GLOBAL(void)
3274{
3275 INT32 tmp0, tmp1, tmp2;
3276 INT32 tmp10, tmp11, tmp12;
3278 JSAMPROW elemptr;
3279 int ctr;
3281
3282 /* Pre-zero output coefficient block. */
3284
3285 /* Pass 1: process rows.
3286 * Note results are scaled up by sqrt(8) compared to a true DCT;
3287 * furthermore, we scale the results by 2**PASS1_BITS.
3288 * We scale the results further by 2 as part of output adaption
3289 * scaling for different DCT size.
3290 * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
3291 */
3292
3293 dataptr = data;
3294 for (ctr = 0; ctr < 3; ctr++) {
3295 elemptr = sample_data[ctr] + start_col;
3296
3297 /* Even part */
3298
 /* Sums of mirrored sample pairs; the middle sum goes straight to tmp11. */
3299 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
3300 tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
3301 tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
3302
3303 tmp10 = tmp0 + tmp2;
3304 tmp12 = tmp0 - tmp2;
3305
 /* Differences of the same mirrored pairs; these feed the odd part. */
3306 tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
3307 tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
3308 tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
3309
3310 /* Apply unsigned->signed conversion. */
3311 dataptr[0] = (DCTELEM)
3312 ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << (PASS1_BITS+1));
3313 dataptr[2] = (DCTELEM)
3314 DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
3315 CONST_BITS-PASS1_BITS-1);
3316 dataptr[4] = (DCTELEM)
3317 DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
3318 CONST_BITS-PASS1_BITS-1);
3319
3320 /* Odd part */
3321
 /* tmp10 is reused.  The product is descaled first so that it can be
  * added to the left-shifted integer terms in dataptr[1] and dataptr[5].
  */
3322 tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
3323 CONST_BITS-PASS1_BITS-1);
3324
3325 dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << (PASS1_BITS+1)));
3326 dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << (PASS1_BITS+1));
3327 dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << (PASS1_BITS+1)));
3328
3329 dataptr += DCTSIZE; /* advance pointer to next row */
3330 }
3331
3332 /* Pass 2: process columns.
3333 * We apply the PASS2_BITS scaling, but leave the
3334 * results scaled up by an overall factor of 8.
3335 * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
3336 * fold into the constant multipliers (other part was done in pass 1):
3337 * 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6) * 16/9.
3338 */
3339
3340 dataptr = data;
 /* Only 6 columns are visited: pass 1 wrote dataptr[0..5] in each row. */
3341 for (ctr = 0; ctr < 6; ctr++) {
3342 /* Even part */
3343
3344 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2];
3345 tmp1 = dataptr[DCTSIZE*1];
3346
 /* The difference is taken now because rows 0 and 2 are overwritten next. */
3347 tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2];
3348
3349 dataptr[DCTSIZE*0] = (DCTELEM)
3350 DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
3351 CONST_BITS+PASS2_BITS);
3352 dataptr[DCTSIZE*2] = (DCTELEM)
3353 DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
3354 CONST_BITS+PASS2_BITS);
3355
3356 /* Odd part */
3357
3358 dataptr[DCTSIZE*1] = (DCTELEM)
3359 DESCALE(MULTIPLY(tmp2, FIX(2.177324216)), /* c1 */
3360 CONST_BITS+PASS2_BITS);
3361
3362 dataptr++; /* advance pointer to next column */
3363 }
3364}
3365
3366
3367/*
3368 * Perform the forward DCT on a 4x2 sample block.
3369 *
3370 * 4-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
 *
 * Input is read from sample_data[0] and sample_data[1], 4 samples per
 * row beginning at column start_col.  The routine is fully unrolled:
 * row 0 results are kept in tmp0..tmp3, row 1 results in tmp10..tmp13,
 * and the 2-point column pass is the sum and difference of each pair,
 * written to data[0..3] and data[DCTSIZE..DCTSIZE+3].
 *
 * NOTE(review): the listing numbers embedded in this block skip 3374,
 * 3380, 3382 and 3386, so some lines are not visible here -- presumably
 * the name/parameter line, one or two declarations (one inside the
 * #if below) and the statement that pre-zeroes the block.  Confirm
 * against the upstream file.
3371 */
3372
3373GLOBAL(void)
3375{
3376 DCTELEM tmp0, tmp2, tmp10, tmp12, tmp4, tmp5;
3377 INT32 tmp1, tmp3, tmp11, tmp13;
3378 INT32 z1, z2, z3;
3379 JSAMPROW elemptr;
3381#if PASS2_BITS > PASS1_BITS + 3
3383#endif
3384
3385 /* Pre-zero output coefficient block. */
3387
3388 /* Pass 1: process rows.
3389 * Note results are scaled up by sqrt(8) compared to a true DCT.
3390 * 4-point FDCT kernel,
3391 * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
3392 */
3393
3394 /* Row 0 */
3395 elemptr = sample_data[0] + start_col;
3396
3397 /* Even part */
3398
3399 tmp4 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
3400 tmp5 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
3401
 /* The rounding constant is added to row 0 only; every column output
  * below contains a row 0 term exactly once.  It is needed only in the
  * configuration where the even outputs are right-shifted.
  */
3402#if PASS2_BITS > PASS1_BITS + 3
3403 /* Add fudge factor here for final downscale. */
3404#if PASS2_BITS > PASS1_BITS + 4
3405 tmp4 += 1 << (PASS2_BITS-PASS1_BITS-3-1);
3406#else
3407 tmp4 += 1;
3408#endif
3409#endif
3410
3411 tmp0 = tmp4 + tmp5;
3412 tmp2 = tmp4 - tmp5;
3413
3414 /* Odd part */
3415
3416 z2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
3417 z3 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
3418
3419 z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */
3420 /* Add fudge factor here for final descale. */
 /* Half of 1 << (CONST_BITS+PASS2_BITS-PASS1_BITS-3), the shift count
  * used for columns 1 and 3; z1 carries it into both tmp1 and tmp3.
  */
3421 z1 += ONE << (CONST_BITS+PASS2_BITS-PASS1_BITS-3-1);
3422 tmp1 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
3423 tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
3424
3425 /* Row 1 */
3426 elemptr = sample_data[1] + start_col;
3427
3428 /* Even part */
3429
3430 tmp4 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
3431 tmp5 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
3432
3433 tmp10 = tmp4 + tmp5;
3434 tmp12 = tmp4 - tmp5;
3435
3436 /* Odd part */
3437
3438 z2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
3439 z3 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
3440
 /* No rounding constant for row 1: row 0 already supplies it. */
3441 z1 = MULTIPLY(z2 + z3, FIX_0_541196100); /* c6 */
3442 tmp11 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
3443 tmp13 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
3444
3445 /* Pass 2: process columns.
3446 * We leave the results scaled up by an overall factor of 8.
3447 * We must also scale the output by (8/4)*(8/2) = 2**3.
3448 */
3449
3450 /* Column 0 */
3451 /* Apply unsigned->signed conversion. */
3452
 /* Columns 0 and 2 hold plain integer sums, so the net scaling is a
  * left shift, no shift, or a right shift depending on how PASS2_BITS
  * compares with PASS1_BITS + 3.
  */
3453#if PASS2_BITS < PASS1_BITS + 3
3454 data[DCTSIZE*0] =
3455 (tmp0 + tmp10 - 8 * CENTERJSAMPLE) << (3+PASS1_BITS-PASS2_BITS);
3456 data[DCTSIZE*1] = (tmp0 - tmp10) << (3+PASS1_BITS-PASS2_BITS);
3457
3458 /* Column 2 */
3459 data[DCTSIZE*0+2] = (tmp2 + tmp12) << (3+PASS1_BITS-PASS2_BITS);
3460 data[DCTSIZE*1+2] = (tmp2 - tmp12) << (3+PASS1_BITS-PASS2_BITS);
3461#else
3462#if PASS2_BITS == PASS1_BITS + 3
3463 data[DCTSIZE*0] = tmp0 + tmp10 - 8 * CENTERJSAMPLE;
3464 data[DCTSIZE*1] = tmp0 - tmp10;
3465
3466 /* Column 2 */
3467 data[DCTSIZE*0+2] = tmp2 + tmp12;
3468 data[DCTSIZE*1+2] = tmp2 - tmp12;
3469#else
3470 data[DCTSIZE*0] =
3471 IRIGHT_SHIFT(tmp0 + tmp10 - 8 * CENTERJSAMPLE,
3472 PASS2_BITS-PASS1_BITS-3);
3473 data[DCTSIZE*1] =
3474 IRIGHT_SHIFT(tmp0 - tmp10, PASS2_BITS-PASS1_BITS-3);
3475
3476 /* Column 2 */
3477 data[DCTSIZE*0+2] =
3478 IRIGHT_SHIFT(tmp2 + tmp12, PASS2_BITS-PASS1_BITS-3);
3479 data[DCTSIZE*1+2] =
3480 IRIGHT_SHIFT(tmp2 - tmp12, PASS2_BITS-PASS1_BITS-3);
3481#endif
3482#endif
3483
 /* Columns 1 and 3 hold MULTIPLY products and are always right-shifted. */
3484 /* Column 1 */
3485 data[DCTSIZE*0+1] = (DCTELEM)
3486 RIGHT_SHIFT(tmp1 + tmp11, CONST_BITS+PASS2_BITS-PASS1_BITS-3);
3487 data[DCTSIZE*1+1] = (DCTELEM)
3488 RIGHT_SHIFT(tmp1 - tmp11, CONST_BITS+PASS2_BITS-PASS1_BITS-3);
3489
3490 /* Column 3 */
3491 data[DCTSIZE*0+3] = (DCTELEM)
3492 RIGHT_SHIFT(tmp3 + tmp13, CONST_BITS+PASS2_BITS-PASS1_BITS-3);
3493 data[DCTSIZE*1+3] = (DCTELEM)
3494 RIGHT_SHIFT(tmp3 - tmp13, CONST_BITS+PASS2_BITS-PASS1_BITS-3);
3495}
3496
3497
3498/*
3499 * Perform the forward DCT on a 2x1 sample block.
3500 *
3501 * 2-point FDCT in pass 1 (rows), 1-point in pass 2 (columns).
 *
 * Input is the two samples sample_data[0][start_col] and
 * sample_data[0][start_col+1].  Only data[0] (their sum, level-shifted)
 * and data[1] (their difference) are assigned by the visible code.
 *
 * NOTE(review): the listing numbers embedded in this block skip 3505
 * and 3511, so some lines are not visible here -- presumably the
 * name/parameter line and the statement that pre-zeroes the block.
 * Confirm against the upstream file.
3502 */
3503
3504GLOBAL(void)
3506{
3507 DCTELEM tmp0, tmp1;
3508 JSAMPROW elemptr;
3509
3510 /* Pre-zero output coefficient block. */
3512
3513 elemptr = sample_data[0] + start_col;
3514
3515 tmp0 = GETJSAMPLE(elemptr[0]);
3516 tmp1 = GETJSAMPLE(elemptr[1]);
3517
3518 /* We leave the results scaled up by an overall factor of 8.
3519 * We must also scale the output by (8/2)*(8/1) = 2**5.
3520 */
3521
3522 /* Even part */
3523
3524 /* Apply unsigned->signed conversion. */
 /* NOTE(review): the shift count 5+PASS1_BITS-PASS2_BITS is used here
  * without a preprocessor guard, so it presumably relies on
  * PASS2_BITS <= PASS1_BITS + 5 -- confirm against the macro definitions.
  */
3525 data[0] =
3526 (tmp0 + tmp1 - 2 * CENTERJSAMPLE) << (5+PASS1_BITS-PASS2_BITS);
3527
3528 /* Odd part */
3529
3530 data[1] = (tmp0 - tmp1) << (5+PASS1_BITS-PASS2_BITS);
3531}
3532
3533
3534/*
3535 * Perform the forward DCT on an 8x16 sample block.
3536 *
3537 * 8-point FDCT in pass 1 (rows), 16-point in pass 2 (columns).
3538 */
3539
3540GLOBAL(void)
3542{
3543 INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3544 INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
3545 INT32 z1;
3546 DCTELEM workspace[DCTSIZE2];
3548 DCTELEM *wsptr;
3549 JSAMPROW elemptr;
3550 int ctr;
3552
3553 /* Pass 1: process rows.
3554 * Note results are scaled up by sqrt(8) compared to a true DCT;
3555 * furthermore, we scale the results by 2**PASS1_BITS.
3556 * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3557 */
3558
3559 dataptr = data;
3560 ctr = 0;
3561 for (;;) {
3562 elemptr = sample_data[ctr] + start_col;
3563
3564 /* Even part per LL&M figure 1 --- note that published figure is faulty;
3565 * rotator "c1" should be "c6".
3566 */
3567
3568 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
3569 tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
3570 tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
3571 tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
3572
3573 tmp10 = tmp0 + tmp3;
3574 tmp12 = tmp0 - tmp3;
3575 tmp11 = tmp1 + tmp2;
3576 tmp13 = tmp1 - tmp2;
3577
3578 tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
3579 tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
3580 tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
3581 tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
3582
3583 /* Apply unsigned->signed conversion. */
3584 dataptr[0] = PASS1_OUTPUT(tmp10 + tmp11 - 8 * CENTERJSAMPLE);
3585 dataptr[4] = PASS1_OUTPUT(tmp10 - tmp11);
3586
3587 z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); /* c6 */
3588 /* Add fudge factor here for final descale. */
3589 z1 += ONE << (CONST_BITS-PASS1_BITS-1);
3590
3591 dataptr[2] = (DCTELEM)
3592 RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
3593 CONST_BITS-PASS1_BITS);
3594 dataptr[6] = (DCTELEM)
3595 RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
3596 CONST_BITS-PASS1_BITS);
3597
3598 /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
3599 * i0..i3 in the paper are tmp0..tmp3 here.
3600 */
3601
3602 tmp12 = tmp0 + tmp2;
3603 tmp13 = tmp1 + tmp3;
3604
3605 z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
3606 /* Add fudge factor here for final descale. */
3607 z1 += ONE << (CONST_BITS-PASS1_BITS-1);
3608
3609 tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* -c3+c5 */
3610 tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
3611 tmp12 += z1;
3612 tmp13 += z1;
3613
3614 z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
3615 tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
3616 tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
3617 tmp0 += z1 + tmp12;
3618 tmp3 += z1 + tmp13;
3619
3620 z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
3621 tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
3622 tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
3623 tmp1 += z1 + tmp13;
3624 tmp2 += z1 + tmp12;
3625
3626 dataptr[1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS-PASS1_BITS);
3627 dataptr[3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS-PASS1_BITS);
3628 dataptr[5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
3629 dataptr[7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS-PASS1_BITS);
3630
3631 ctr++;
3632
3633 if (ctr != DCTSIZE) {
3634 if (ctr == DCTSIZE * 2)
3635 break; /* Done. */
3636 dataptr += DCTSIZE; /* advance pointer to next row */
3637 } else
3638 dataptr = workspace; /* switch pointer to extended workspace */
3639 }
3640
3641 /* Pass 2: process columns.
3642 * We apply the PASS2_BITS scaling, but leave the
3643 * results scaled up by an overall factor of 8.
3644 * We must also scale the output by 8/16 = 1/2.
3645 * 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
3646 */
3647
3648 dataptr = data;
3649 wsptr = workspace;
3650 for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
3651 /* Even part */
3652
3653 tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
3654 tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
3655 tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
3656 tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
3657 tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
3658 tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
3659 tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
3660 tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
3661
3662 tmp10 = tmp0 + tmp7;
3663 tmp14 = tmp0 - tmp7;
3664 tmp11 = tmp1 + tmp6;
3665 tmp15 = tmp1 - tmp6;
3666 tmp12 = tmp2 + tmp5;
3667 tmp16 = tmp2 - tmp5;
3668 tmp13 = tmp3 + tmp4;
3669 tmp17 = tmp3 - tmp4;
3670
3671 tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
3672 tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
3673 tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
3674 tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
3675 tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
3676 tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
3677 tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
3678 tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];
3679
3680 dataptr[DCTSIZE*0] = (DCTELEM)
3681#if PASS2_BITS > 0
3682 RIGHT_SHIFT(tmp10 + tmp11 + tmp12 + tmp13 + (ONE << PASS2_BITS),
3683 PASS2_BITS+1);
3684#else
3685 RIGHT_SHIFT(tmp10 + tmp11 + tmp12 + tmp13 + ONE, 1);
3686#endif
3687 dataptr[DCTSIZE*4] = (DCTELEM)
3688 DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
3689 MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
3690 CONST_BITS+PASS2_BITS+1);
3691
3692 tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
3693 MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
3694
3695 dataptr[DCTSIZE*2] = (DCTELEM)
3696 DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
3697 + MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
3698 CONST_BITS+PASS2_BITS+1);
3699 dataptr[DCTSIZE*6] = (DCTELEM)
3700 DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
3701 - MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
3702 CONST_BITS+PASS2_BITS+1);
3703
3704 /* Odd part */
3705
3706 tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
3707 MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
3708 tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
3709 MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
3710 tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
3711 MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
3712 tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
3713 MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
3714 tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
3715 MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
3716 tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
3717 MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
3718 tmp10 = tmp11 + tmp12 + tmp13 -
3719 MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
3720 MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
3721 tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
3722 - MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
3723 tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
3724 + MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
3725 tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
3726 + MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
3727
3728 dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS2_BITS+1);
3729 dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS2_BITS+1);
3730 dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS2_BITS+1);
3731 dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS2_BITS+1);
3732
3733 dataptr++; /* advance pointer to next column */
3734 wsptr++; /* advance pointer to next column */
3735 }
3736}
3737
3738
3739/*
3740 * Perform the forward DCT on a 7x14 sample block.
3741 *
3742 * 7-point FDCT in pass 1 (rows), 14-point in pass 2 (columns).
3743 */
3744
3745GLOBAL(void)
3747{
3748 INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
3749 INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
3750 INT32 z1, z2, z3;
3751 DCTELEM workspace[8*6];
3753 DCTELEM *wsptr;
3754 JSAMPROW elemptr;
3755 int ctr;
3757
3758 /* Pre-zero output coefficient block. */
3760
3761 /* Pass 1: process rows.
3762 * Note results are scaled up by sqrt(8) compared to a true DCT;
3763 * furthermore, we scale the results by 2**PASS1_BITS.
3764 * 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
3765 */
3766
3767 dataptr = data;
3768 ctr = 0;
3769 for (;;) {
3770 elemptr = sample_data[ctr] + start_col;
3771
3772 /* Even part */
3773
3774 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
3775 tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
3776 tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
3777 tmp3 = GETJSAMPLE(elemptr[3]);
3778
3779 tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
3780 tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
3781 tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
3782
3783 z1 = tmp0 + tmp2;
3784 /* Apply unsigned->signed conversion. */
3785 dataptr[0] = PASS1_OUTPUT(z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE);
3786 tmp3 += tmp3;
3787 z1 -= tmp3;
3788 z1 -= tmp3;
3789 z1 = MULTIPLY(z1, FIX(0.353553391)); /* (c2+c6-c4)/2 */
3790 z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002)); /* (c2+c4-c6)/2 */
3791 z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123)); /* c6 */
3792 dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
3793 z1 -= z2;
3794 z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734)); /* c4 */
3795 dataptr[4] = (DCTELEM)
3796 DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
3797 CONST_BITS-PASS1_BITS);
3798 dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
3799
3800 /* Odd part */
3801
3802 tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347)); /* (c3+c1-c5)/2 */
3803 tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339)); /* (c3+c5-c1)/2 */
3804 tmp0 = tmp1 - tmp2;
3805 tmp1 += tmp2;
3806 tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
3807 tmp1 += tmp2;
3808 tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268)); /* c5 */
3809 tmp0 += tmp3;
3810 tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693)); /* c3+c1-c5 */
3811
3812 dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
3813 dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
3814 dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
3815
3816 ctr++;
3817
3818 if (ctr != DCTSIZE) {
3819 if (ctr == 14)
3820 break; /* Done. */
3821 dataptr += DCTSIZE; /* advance pointer to next row */
3822 } else
3823 dataptr = workspace; /* switch pointer to extended workspace */
3824 }
3825
3826 /* Pass 2: process columns.
3827 * We apply the PASS2_BITS scaling, but leave the
3828 * results scaled up by an overall factor of 8.
3829 * We must also scale the output by (8/7)*(8/14) = 32/49,
3830 * which we fold into the constant multipliers:
3831 * 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28) * 32/49.
3832 */
3833
3834 dataptr = data;
3835 wsptr = workspace;
3836 for (ctr = 0; ctr < 7; ctr++) {
3837 /* Even part */
3838
3839 tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
3840 tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
3841 tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
3842 tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
3843 tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
3844 tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
3845 tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
3846
3847 tmp10 = tmp0 + tmp6;
3848 tmp14 = tmp0 - tmp6;
3849 tmp11 = tmp1 + tmp5;
3850 tmp15 = tmp1 - tmp5;
3851 tmp12 = tmp2 + tmp4;
3852 tmp16 = tmp2 - tmp4;
3853
3854 tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
3855 tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
3856 tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
3857 tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
3858 tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
3859 tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
3860 tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];
3861
3862 dataptr[DCTSIZE*0] = (DCTELEM)
3863 DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
3864 FIX(0.653061224)), /* 32/49 */
3865 CONST_BITS+PASS2_BITS);
3866 tmp13 += tmp13;
3867 dataptr[DCTSIZE*4] = (DCTELEM)
3868 DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
3869 MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
3870 MULTIPLY(tmp12 - tmp13, FIX(0.575835255)), /* c8 */
3871 CONST_BITS+PASS2_BITS);
3872
3873 tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570)); /* c6 */
3874
3875 dataptr[DCTSIZE*2] = (DCTELEM)
3876 DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691)) /* c2-c6 */
3877 + MULTIPLY(tmp16, FIX(0.400721155)), /* c10 */
3878 CONST_BITS+PASS2_BITS);
3879 dataptr[DCTSIZE*6] = (DCTELEM)
3880 DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725)) /* c6+c10 */
3881 - MULTIPLY(tmp16, FIX(0.900412262)), /* c2 */
3882 CONST_BITS+PASS2_BITS);
3883
3884 /* Odd part */
3885
3886 tmp10 = tmp1 + tmp2;
3887 tmp11 = tmp5 - tmp4;
3888 dataptr[DCTSIZE*7] = (DCTELEM)
3889 DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
3890 FIX(0.653061224)), /* 32/49 */
3891 CONST_BITS+PASS2_BITS);
3892 tmp3 = MULTIPLY(tmp3 , FIX(0.653061224)); /* 32/49 */
3893 tmp10 = MULTIPLY(tmp10, - FIX(0.103406812)); /* -c13 */
3894 tmp11 = MULTIPLY(tmp11, FIX(0.917760839)); /* c1 */
3895 tmp10 += tmp11 - tmp3;
3896 tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) + /* c5 */
3897 MULTIPLY(tmp4 + tmp6, FIX(0.491367823)); /* c9 */
3898 dataptr[DCTSIZE*5] = (DCTELEM)
3899 DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
3900 + MULTIPLY(tmp4, FIX(0.731428202)), /* c1+c11-c9 */
3901 CONST_BITS+PASS2_BITS);
3902 tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) + /* c3 */
3903 MULTIPLY(tmp5 - tmp6, FIX(0.305035186)); /* c11 */
3904 dataptr[DCTSIZE*3] = (DCTELEM)
3905 DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
3906 - MULTIPLY(tmp5, FIX(2.004803435)), /* c1+c5+c11 */
3907 CONST_BITS+PASS2_BITS);
3908 dataptr[DCTSIZE*1] = (DCTELEM)
3909 DESCALE(tmp11 + tmp12 + tmp3
3910 - MULTIPLY(tmp0, FIX(0.735987049)) /* c3+c5-c1 */
3911 - MULTIPLY(tmp6, FIX(0.082925825)), /* c9-c11-c13 */
3912 CONST_BITS+PASS2_BITS);
3913
3914 dataptr++; /* advance pointer to next column */
3915 wsptr++; /* advance pointer to next column */
3916 }
3917}
3918
3919
3920/*
3921 * Perform the forward DCT on a 6x12 sample block.
3922 *
3923 * 6-point FDCT in pass 1 (rows), 12-point in pass 2 (columns).
3924 */
3925
3926GLOBAL(void)
3928{
3929 INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3930 INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
3931 DCTELEM workspace[8*4];
3933 DCTELEM *wsptr;
3934 JSAMPROW elemptr;
3935 int ctr;
3937
3938 /* Pre-zero output coefficient block. */
3940
3941 /* Pass 1: process rows.
3942 * Note results are scaled up by sqrt(8) compared to a true DCT;
3943 * furthermore, we scale the results by 2**PASS1_BITS.
3944 * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
3945 */
3946
3947 dataptr = data;
3948 ctr = 0;
3949 for (;;) {
3950 elemptr = sample_data[ctr] + start_col;
3951
3952 /* Even part */
3953
3954 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
3955 tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
3956 tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
3957
3958 tmp10 = tmp0 + tmp2;
3959 tmp12 = tmp0 - tmp2;
3960
3961 tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
3962 tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
3963 tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
3964
3965 /* Apply unsigned->signed conversion. */
3966 dataptr[0] = PASS1_OUTPUT(tmp10 + tmp11 - 6 * CENTERJSAMPLE);
3967 dataptr[2] = (DCTELEM)
3968 DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
3969 CONST_BITS-PASS1_BITS);
3970 dataptr[4] = (DCTELEM)
3971 DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
3972 CONST_BITS-PASS1_BITS);
3973
3974 /* Odd part */
3975
3976 tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
3977 CONST_BITS-PASS1_BITS);
3978
3979#if PASS1_BITS > 0
3980 dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
3981 dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
3982 dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
3983#else
3984 dataptr[1] = (DCTELEM) (tmp10 + tmp0 + tmp1);
3985 dataptr[3] = (DCTELEM) (tmp0 - tmp1 - tmp2);
3986 dataptr[5] = (DCTELEM) (tmp10 + tmp2 - tmp1);
3987#endif
3988
3989 ctr++;
3990
3991 if (ctr != DCTSIZE) {
3992 if (ctr == 12)
3993 break; /* Done. */
3994 dataptr += DCTSIZE; /* advance pointer to next row */
3995 } else
3996 dataptr = workspace; /* switch pointer to extended workspace */
3997 }
3998
3999 /* Pass 2: process columns.
4000 * We apply the PASS2_BITS scaling, but leave the
4001 * results scaled up by an overall factor of 8.
4002 * We must also scale the output by (8/6)*(8/12) = 8/9,
4003 * which we fold into the constant multipliers:
4004 * 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24) * 8/9.
4005 */
4006
4007 dataptr = data;
4008 wsptr = workspace;
4009 for (ctr = 0; ctr < 6; ctr++) {
4010 /* Even part */
4011
4012 tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
4013 tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
4014 tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
4015 tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
4016 tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
4017 tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];
4018
4019 tmp10 = tmp0 + tmp5;
4020 tmp13 = tmp0 - tmp5;
4021 tmp11 = tmp1 + tmp4;
4022 tmp14 = tmp1 - tmp4;
4023 tmp12 = tmp2 + tmp3;
4024 tmp15 = tmp2 - tmp3;
4025
4026 tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
4027 tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
4028 tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
4029 tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
4030 tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
4031 tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];
4032
4033 dataptr[DCTSIZE*0] = (DCTELEM)
4034 DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
4035 CONST_BITS+PASS2_BITS);
4036 dataptr[DCTSIZE*6] = (DCTELEM)
4037 DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
4038 CONST_BITS+PASS2_BITS);
4039 dataptr[DCTSIZE*4] = (DCTELEM)
4040 DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)), /* c4 */
4041 CONST_BITS+PASS2_BITS);
4042 dataptr[DCTSIZE*2] = (DCTELEM)
4043 DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) + /* 8/9 */
4044 MULTIPLY(tmp13 + tmp15, FIX(1.214244803)), /* c2 */
4045 CONST_BITS+PASS2_BITS);
4046
4047 /* Odd part */
4048
4049 tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200)); /* c9 */
4050 tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102)); /* c3-c9 */
4051 tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502)); /* c3+c9 */
4052 tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603)); /* c5 */
4053 tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039)); /* c7 */
4054 tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
4055 + MULTIPLY(tmp5, FIX(0.164081699)); /* c11 */
4056 tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
4057 tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
4058 + MULTIPLY(tmp5, FIX(0.765261039)); /* c7 */
4059 tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
4060 - MULTIPLY(tmp5, FIX(0.997307603)); /* c5 */
4061 tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
4062 - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
4063
4064 dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS2_BITS);
4065 dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS2_BITS);
4066 dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS2_BITS);
4067 dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS2_BITS);
4068
4069 dataptr++; /* advance pointer to next column */
4070 wsptr++; /* advance pointer to next column */
4071 }
4072}
4073
4074
4075/*
4076 * Perform the forward DCT on a 5x10 sample block.
4077 *
4078 * 5-point FDCT in pass 1 (rows), 10-point in pass 2 (columns).
4079 */
4080
4081GLOBAL(void)
4083{
4084 INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
4085 INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
4086 DCTELEM workspace[8*2];
4088 DCTELEM *wsptr;
4089 JSAMPROW elemptr;
4090 int ctr;
4092
4093 /* Pre-zero output coefficient block. */
4095
4096 /* Pass 1: process rows.
4097 * Note results are scaled up by sqrt(8) compared to a true DCT;
4098 * furthermore, we scale the results by 2**PASS1_BITS.
4099 * 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
4100 */
4101
4102 dataptr = data;
4103 ctr = 0;
4104 for (;;) {
4105 elemptr = sample_data[ctr] + start_col;
4106
4107 /* Even part */
4108
4109 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
4110 tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
4111 tmp2 = GETJSAMPLE(elemptr[2]);
4112
4113 tmp10 = tmp0 + tmp1;
4114 tmp11 = tmp0 - tmp1;
4115
4116 tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
4117 tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
4118
4119 /* Apply unsigned->signed conversion. */
4120 dataptr[0] = PASS1_OUTPUT(tmp10 + tmp2 - 5 * CENTERJSAMPLE);
4121 tmp11 = MULTIPLY(tmp11, FIX(0.790569415)); /* (c2+c4)/2 */
4122 tmp10 -= tmp2 << 2;
4123 tmp10 = MULTIPLY(tmp10, FIX(0.353553391)); /* (c2-c4)/2 */
4124 dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS);
4125 dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS);
4126
4127 /* Odd part */
4128
4129 tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876)); /* c3 */
4130
4131 dataptr[1] = (DCTELEM)
4132 DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
4133 CONST_BITS-PASS1_BITS);
4134 dataptr[3] = (DCTELEM)
4135 DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
4136 CONST_BITS-PASS1_BITS);
4137
4138 ctr++;
4139
4140 if (ctr != DCTSIZE) {
4141 if (ctr == 10)
4142 break; /* Done. */
4143 dataptr += DCTSIZE; /* advance pointer to next row */
4144 } else
4145 dataptr = workspace; /* switch pointer to extended workspace */
4146 }
4147
4148 /* Pass 2: process columns.
4149 * We apply the PASS2_BITS scaling, but leave the
4150 * results scaled up by an overall factor of 8.
4151 * We must also scale the output by (8/5)*(8/10) = 32/25,
4152 * which we fold into the constant multipliers:
4153 * 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20) * 32/25.
4154 */
4155
4156 dataptr = data;
4157 wsptr = workspace;
4158 for (ctr = 0; ctr < 5; ctr++) {
4159 /* Even part */
4160
4161 tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
4162 tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
4163 tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
4164 tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
4165 tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
4166
4167 tmp10 = tmp0 + tmp4;
4168 tmp13 = tmp0 - tmp4;
4169 tmp11 = tmp1 + tmp3;
4170 tmp14 = tmp1 - tmp3;
4171
4172 tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
4173 tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
4174 tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
4175 tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
4176 tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
4177
4178 dataptr[DCTSIZE*0] = (DCTELEM)
4179 DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
4180 CONST_BITS+PASS2_BITS);
4181 tmp12 += tmp12;
4182 dataptr[DCTSIZE*4] = (DCTELEM)
4183 DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
4184 MULTIPLY(tmp11 - tmp12, FIX(0.559380511)), /* c8 */
4185 CONST_BITS+PASS2_BITS);
4186 tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961)); /* c6 */
4187 dataptr[DCTSIZE*2] = (DCTELEM)
4188 DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)), /* c2-c6 */
4189 CONST_BITS+PASS2_BITS);
4190 dataptr[DCTSIZE*6] = (DCTELEM)
4191 DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)), /* c2+c6 */
4192 CONST_BITS+PASS2_BITS);
4193
4194 /* Odd part */
4195
4196 tmp10 = tmp0 + tmp4;
4197 tmp11 = tmp1 - tmp3;
4198 dataptr[DCTSIZE*5] = (DCTELEM)
4199 DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)), /* 32/25 */
4200 CONST_BITS+PASS2_BITS);
4201 tmp2 = MULTIPLY(tmp2, FIX(1.28)); /* 32/25 */
4202 dataptr[DCTSIZE*1] = (DCTELEM)
4203 DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) + /* c1 */
4204 MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 + /* c3 */
4205 MULTIPLY(tmp3, FIX(0.821810588)) + /* c7 */
4206 MULTIPLY(tmp4, FIX(0.283176630)), /* c9 */
4207 CONST_BITS+PASS2_BITS);
4208 tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) - /* (c3+c7)/2 */
4209 MULTIPLY(tmp1 + tmp3, FIX(0.752365123)); /* (c1-c9)/2 */
4210 tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) + /* (c3-c7)/2 */
4211 MULTIPLY(tmp11, FIX(0.64)) - tmp2; /* 16/25 */
4212 dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+PASS2_BITS);
4213 dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+PASS2_BITS);
4214
4215 dataptr++; /* advance pointer to next column */
4216 wsptr++; /* advance pointer to next column */
4217 }
4218}
4219
4220
4221/*
4222 * Perform the forward DCT on a 4x8 sample block.
4223 *
4224 * 4-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
4225 */
4226
4227GLOBAL(void)
4229{
4230 INT32 tmp0, tmp1, tmp2, tmp3;
4231 INT32 tmp10, tmp11, tmp12, tmp13;
4232 INT32 z1;
4234 JSAMPROW elemptr;
4235 int ctr;
4237
4238 /* Pre-zero output coefficient block. */
4240
4241 /* Pass 1: process rows.
4242 * Note results are scaled up by sqrt(8) compared to a true DCT;
4243 * furthermore, we scale the results by 2**PASS1_BITS.
4244 * We must also scale the output by 8/4 = 2, which we add here.
4245 * 4-point FDCT kernel,
4246 * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
4247 */
4248
4249 dataptr = data;
4250 for (ctr = 0; ctr < DCTSIZE; ctr++) {
4251 elemptr = sample_data[ctr] + start_col;
4252
4253 /* Even part */
4254
4255 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
4256 tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
4257
4258 tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
4259 tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
4260
4261 /* Apply unsigned->signed conversion. */
4262 dataptr[0] = (DCTELEM)
4263 ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+1));
4264 dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+1));
4265
4266 /* Odd part */
4267
4268 tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
4269 /* Add fudge factor here for final descale. */
4270 tmp0 += ONE << (CONST_BITS-PASS1_BITS-2);
4271
4272 dataptr[1] = (DCTELEM)
4273 RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
4274 CONST_BITS-PASS1_BITS-1);
4275 dataptr[3] = (DCTELEM)
4276 RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
4277 CONST_BITS-PASS1_BITS-1);
4278
4279 dataptr += DCTSIZE; /* advance pointer to next row */
4280 }
4281
4282 /* Pass 2: process columns.
4283 * We apply the PASS2_BITS scaling, but leave the
4284 * results scaled up by an overall factor of 8.
4285 * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
4286 */
4287
4288 dataptr = data;
4289 for (ctr = 0; ctr < 4; ctr++) {
4290 /* Even part per LL&M figure 1 --- note that published figure is faulty;
4291 * rotator "c1" should be "c6".
4292 */
4293
4294 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
4295 tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
4296 tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
4297 tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
4298
4299 /* Add fudge factor here for final descale. */
4300#if PASS2_BITS > 1
4301 tmp10 = tmp0 + tmp3 + (ONE << (PASS2_BITS-1));
4302#else
4303#if PASS2_BITS > 0
4304 tmp10 = tmp0 + tmp3 + ONE;
4305#else
4306 tmp10 = tmp0 + tmp3;
4307#endif
4308#endif
4309 tmp12 = tmp0 - tmp3;
4310 tmp11 = tmp1 + tmp2;
4311 tmp13 = tmp1 - tmp2;
4312
4313 tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
4314 tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
4315 tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
4316 tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
4317
4318 dataptr[DCTSIZE*0] = PASS2_OUTPUT(tmp10 + tmp11);
4319 dataptr[DCTSIZE*4] = PASS2_OUTPUT(tmp10 - tmp11);
4320
4321 z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); /* c6 */
4322 /* Add fudge factor here for final descale. */
4323 z1 += ONE << (CONST_BITS+PASS2_BITS-1);
4324
4325 dataptr[DCTSIZE*2] = (DCTELEM)
4326 RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
4327 CONST_BITS+PASS2_BITS);
4328 dataptr[DCTSIZE*6] = (DCTELEM)
4329 RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
4330 CONST_BITS+PASS2_BITS);
4331
4332 /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
4333 * i0..i3 in the paper are tmp0..tmp3 here.
4334 */
4335
4336 tmp12 = tmp0 + tmp2;
4337 tmp13 = tmp1 + tmp3;
4338
4339 z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
4340 /* Add fudge factor here for final descale. */
4341 z1 += ONE << (CONST_BITS+PASS2_BITS-1);
4342
4343 tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* -c3+c5 */
4344 tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
4345 tmp12 += z1;
4346 tmp13 += z1;
4347
4348 z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
4349 tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
4350 tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
4351 tmp0 += z1 + tmp12;
4352 tmp3 += z1 + tmp13;
4353
4354 z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
4355 tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
4356 tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
4357 tmp1 += z1 + tmp13;
4358 tmp2 += z1 + tmp12;
4359
4360 dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS+PASS2_BITS);
4361 dataptr[DCTSIZE*3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS+PASS2_BITS);
4362 dataptr[DCTSIZE*5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS+PASS2_BITS);
4363 dataptr[DCTSIZE*7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS+PASS2_BITS);
4364
4365 dataptr++; /* advance pointer to next column */
4366 }
4367}
4368
4369
4370/*
4371 * Perform the forward DCT on a 3x6 sample block.
4372 *
4373 * 3-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
4374 */
4375
4376GLOBAL(void)
4378{
4379 INT32 tmp0, tmp1, tmp2;
4380 INT32 tmp10, tmp11, tmp12;
4382 JSAMPROW elemptr;
4383 int ctr;
4385
4386 /* Pre-zero output coefficient block. */
4388
4389 /* Pass 1: process rows.
4390 * Note results are scaled up by sqrt(8) compared to a true DCT;
4391 * furthermore, we scale the results by 2**PASS1_BITS.
4392 * We scale the results further by 2 as part of output adaption
4393 * scaling for different DCT size.
4394 * 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
4395 */
4396
4397 dataptr = data;
4398 for (ctr = 0; ctr < 6; ctr++) {
4399 elemptr = sample_data[ctr] + start_col;
4400
4401 /* Even part */
4402
4403 tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
4404 tmp1 = GETJSAMPLE(elemptr[1]);
4405
4406 tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
4407
4408 /* Apply unsigned->signed conversion. */
4409 dataptr[0] = (DCTELEM)
4410 ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+1));
4411 dataptr[2] = (DCTELEM)
4412 DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
4413 CONST_BITS-PASS1_BITS-1);
4414
4415 /* Odd part */
4416
4417 dataptr[1] = (DCTELEM)
4418 DESCALE(MULTIPLY(tmp2, FIX(1.224744871)), /* c1 */
4419 CONST_BITS-PASS1_BITS-1);
4420
4421 dataptr += DCTSIZE; /* advance pointer to next row */
4422 }
4423
4424 /* Pass 2: process columns.
4425 * We apply the PASS2_BITS scaling, but leave the
4426 * results scaled up by an overall factor of 8.
4427 * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
4428 * fold into the constant multipliers (other part was done in pass 1):
4429 * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
4430 */
4431
4432 dataptr = data;
4433 for (ctr = 0; ctr < 3; ctr++) {
4434 /* Even part */
4435
4436 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
4437 tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
4438 tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
4439
4440 tmp10 = tmp0 + tmp2;
4441 tmp12 = tmp0 - tmp2;
4442
4443 tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
4444 tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
4445 tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
4446
4447 dataptr[DCTSIZE*0] = (DCTELEM)
4448 DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
4449 CONST_BITS+PASS2_BITS);
4450 dataptr[DCTSIZE*2] = (DCTELEM)
4451 DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
4452 CONST_BITS+PASS2_BITS);
4453 dataptr[DCTSIZE*4] = (DCTELEM)
4454 DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
4455 CONST_BITS+PASS2_BITS);
4456
4457 /* Odd part */
4458
4459 tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
4460
4461 dataptr[DCTSIZE*1] = (DCTELEM)
4462 DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
4463 CONST_BITS+PASS2_BITS);
4464 dataptr[DCTSIZE*3] = (DCTELEM)
4465 DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
4466 CONST_BITS+PASS2_BITS);
4467 dataptr[DCTSIZE*5] = (DCTELEM)
4468 DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
4469 CONST_BITS+PASS2_BITS);
4470
4471 dataptr++; /* advance pointer to next column */
4472 }
4473}
4474
4475
4476/*
4477 * Perform the forward DCT on a 2x4 sample block.
4478 *
4479 * 2-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
 *
 * Inputs, as they are used in the body below:
 *   data        - coefficient block receiving the result.  Pass 1 writes
 *                 entries 0 and 1 of four consecutive rows (DCTSIZE
 *                 elements apart); pass 2 rewrites those same eight
 *                 entries in place, column by column.
 *   sample_data - array of sample rows; rows 0..3 are read.
 *   start_col   - offset added to each row pointer; two samples are read
 *                 from every row starting at this offset.
 *
 * All output goes through data.
 *
 * NOTE(review): the embedded listing numbers jump over 4483, 4487, 4490
 * and 4493.  The function declarator, a declaration for dataptr and the
 * statement belonging to the "Pre-zero" comment are therefore not present
 * in this text -- restore them from the original file before compiling.
4480 */
4481
4482GLOBAL(void)
4484{
4485 INT32 tmp0, tmp1;
4486 INT32 tmp10, tmp11;
4488 JSAMPROW elemptr;
4489 int ctr;
4491
4492 /* Pre-zero output coefficient block. */
4494
4495 /* Pass 1: process rows.
4496 * Note results are scaled up by sqrt(8) compared to a true DCT.
 *
 * Each of the four rows is reduced to the sum and the difference of its
 * two samples.  Both values are stored without any shift; every shift
 * and multiply of this routine happens in pass 2.
4497 */
4498
4499 dataptr = data;
4500 for (ctr = 0; ctr < 4; ctr++) {
4501 elemptr = sample_data[ctr] + start_col;
4502
4503 /* Even part */
4504
4505 tmp0 = GETJSAMPLE(elemptr[0]);
4506 tmp1 = GETJSAMPLE(elemptr[1]);
4507
4508 /* Apply unsigned->signed conversion. */
 /* Two samples are summed, hence CENTERJSAMPLE is subtracted twice. */
4509 dataptr[0] = (DCTELEM) (tmp0 + tmp1 - 2 * CENTERJSAMPLE);
4510
4511 /* Odd part */
4512
 /* No centring here: the per-sample offsets cancel in the difference. */
4513 dataptr[1] = (DCTELEM) (tmp0 - tmp1);
4514
4515 dataptr += DCTSIZE; /* advance pointer to next row */
4516 }
4517
4518 /* Pass 2: process columns.
4519 * We leave the results scaled up by an overall factor of 8.
4520 * We must also scale the output by (8/2)*(8/4) = 2**3.
4521 * 4-point FDCT kernel,
4522 * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
 *
 * Only the two columns written by pass 1 are processed.  Each column
 * reads rows 0..3 and overwrites rows 0..3 with the transformed values.
4523 */
4524
4525 dataptr = data;
4526 for (ctr = 0; ctr < 2; ctr++) {
4527 /* Even part */
4528
 /* tmp0/tmp1: sums of the mirrored row pairs (0,3) and (1,2);
  * tmp10/tmp11: the matching differences, consumed by the odd part.
  */
4529 tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3];
4530 tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
4531
4532 tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
4533 tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
4534
 /* The net shift for the even outputs is 3+PASS1_BITS-PASS2_BITS.  The
  * preprocessor picks one of three forms so that the shift count is never
  * negative: left shift (positive), plain store (zero), or rounded right
  * shift (negative).
  */
4535#if PASS2_BITS < PASS1_BITS + 3
4536 dataptr[DCTSIZE*0] = (DCTELEM)
4537 ((tmp0 + tmp1) << (3+PASS1_BITS-PASS2_BITS));
4538 dataptr[DCTSIZE*2] = (DCTELEM)
4539 ((tmp0 - tmp1) << (3+PASS1_BITS-PASS2_BITS));
4540#else
4541#if PASS2_BITS == PASS1_BITS + 3
4542 dataptr[DCTSIZE*0] = (DCTELEM) (tmp0 + tmp1);
4543 dataptr[DCTSIZE*2] = (DCTELEM) (tmp0 - tmp1);
4544#else
4545 /* Add fudge factor for descale. */
 /* tmp0 enters both tmp0 + tmp1 and tmp0 - tmp1 exactly once, so adding
  * the rounding term to tmp0 alone rounds both outputs.
  */
4546 tmp0 += ONE << (PASS2_BITS-PASS1_BITS-3-1);
4547
4548 dataptr[DCTSIZE*0] = (DCTELEM)
4549 RIGHT_SHIFT(tmp0 + tmp1, PASS2_BITS-PASS1_BITS-3);
4550 dataptr[DCTSIZE*2] = (DCTELEM)
4551 RIGHT_SHIFT(tmp0 - tmp1, PASS2_BITS-PASS1_BITS-3);
4552#endif
4553#endif
4554
4555 /* Odd part */
4556
 /* tmp0 is reused from here on as the product term shared by both odd
  * outputs; the even-part value it held is no longer needed.
  */
4557 tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
4558 /* Add fudge factor for descale. */
 /* Added once to the shared term, so it carries into rows 1 and 3. */
4559 tmp0 += ONE << (CONST_BITS+PASS2_BITS-PASS1_BITS-3-1);
4560
4561 dataptr[DCTSIZE*1] = (DCTELEM)
4562 RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
4563 CONST_BITS+PASS2_BITS-PASS1_BITS-3);
4564 dataptr[DCTSIZE*3] = (DCTELEM)
4565 RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
4566 CONST_BITS+PASS2_BITS-PASS1_BITS-3);
4567
4568 dataptr++; /* advance pointer to next column */
4569 }
4570}
4571
4572
4573/*
4574 * Perform the forward DCT on a 1x2 sample block.
4575 *
4576 * 1-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
 *
 * Inputs, as they are used in the body below:
 *   data        - coefficient block receiving the result; only entries
 *                 data[DCTSIZE*0] and data[DCTSIZE*1] are assigned here.
 *   sample_data - array of sample rows; rows 0 and 1 are read.
 *   start_col   - index of the single sample taken from each row.
 *
 * All output goes through data.
 *
 * NOTE(review): the embedded listing numbers jump over 4580 and 4585.
 * The function declarator and the statement belonging to the "Pre-zero"
 * comment are therefore not present in this text -- restore them from the
 * original file before compiling.
4577 */
4578
4579GLOBAL(void)
4581{
4582 DCTELEM tmp0, tmp1;
4583
4584 /* Pre-zero output coefficient block. */
4586
4587 /* Pass 1: empty. */
4588
4589 /* Pass 2: process columns.
4590 * We leave the results scaled up by an overall factor of 8.
4591 * We must also scale the output by (8/1)*(8/2) = 2**5.
 *
 * With one sample per row there is nothing to loop over: the two
 * outputs are the centred sum and the difference of the two samples,
 * each shifted left by 5+PASS1_BITS-PASS2_BITS.
 *
 * NOTE(review): the shift count is used without a preprocessor guard, so
 * this relies on PASS2_BITS <= 5+PASS1_BITS -- confirm against the
 * definitions of both macros.
4592 */
4593
4594 /* Even part */
4595
4596 tmp0 = GETJSAMPLE(sample_data[0][start_col]);
4597 tmp1 = GETJSAMPLE(sample_data[1][start_col]);
4598
4599 /* Apply unsigned->signed conversion. */
 /* Two samples are summed, hence CENTERJSAMPLE is subtracted twice. */
4600 data[DCTSIZE*0] =
4601 (tmp0 + tmp1 - 2 * CENTERJSAMPLE) << (5+PASS1_BITS-PASS2_BITS);
4602
4603 /* Odd part */
4604
 /* No centring here: the per-sample offsets cancel in the difference.
  * NOTE(review): if DCTELEM is a signed type, both shifted operands can be
  * negative, and left-shifting a negative value is undefined in ISO C.
  * Presumably the build relies on the usual two's-complement result --
  * confirm.
  */
4605 data[DCTSIZE*1] = (tmp0 - tmp1) << (5+PASS1_BITS-PASS2_BITS);
4606}
4607
4608#endif /* DCT_SCALING_SUPPORTED */
4609#endif /* DCT_ISLOW_SUPPORTED */
#define SIZEOF(_ar)
Definition: calc.h:97
GLint GLenum GLsizei GLsizei GLsizei GLint GLsizei const GLvoid * data
Definition: gl.h:1950
#define ISHIFT_TEMPS
Definition: jcarith.c:110
#define IRIGHT_SHIFT(x, shft)
Definition: jcarith.c:111
#define FIX(x)
Definition: jccolor.c:74
Sorry
Definition: jdcolor.c:19
#define ONE
Definition: jdct.h:365
JSAMPARRAY JDIMENSION start_col
Definition: jdct.h:183
int DCTELEM
Definition: jdct.h:49
JSAMPARRAY sample_data
Definition: jdct.h:183
unsigned int JDIMENSION
Definition: jmorecfg.h:265
#define CENTERJSAMPLE
Definition: jmorecfg.h:120
#define GLOBAL(type)
Definition: jmorecfg.h:327
#define GETJSAMPLE(value)
Definition: jmorecfg.h:114
#define SHIFT_TEMPS
Definition: jpegint.h:300
#define RIGHT_SHIFT(x, shft)
Definition: jpegint.h:301
#define DESCALE(x, n)
Definition: jpegint.h:309
#define DCTSIZE
Definition: jpeglib.h:50
int const JOCTET * dataptr
Definition: jpeglib.h:1033
JSAMPROW * JSAMPARRAY
Definition: jpeglib.h:76
JSAMPLE FAR * JSAMPROW
Definition: jpeglib.h:75
#define DCTSIZE2
Definition: jpeglib.h:51
static int blocks
Definition: mkdosfs.c:527
static double float double int float z1
Definition: server.c:81
static double float double int float double float int float z3
Definition: server.c:81
static double float double int float double float z2
Definition: server.c:81
Definition: inflate.c:139
#define MEMZERO(addr, type, size)
Definition: svc_dg.c:324
int32_t INT32
Definition: typedefs.h:58