#ifdef _MSC_VER
#define JPGD_SIMD_ALIGN(type, name) __declspec(align(16)) type name
#else
#define JPGD_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
#endif
#define SHIFT_INV_ROW (16 - BITS_INV_ACC)
#define SHIFT_INV_COL (1 + BITS_INV_ACC)

// Rounding / correction constants for the row and column passes.
const short IRND_INV_ROW = 1024 * (6 - BITS_INV_ACC);
const short IRND_INV_COL = 16 * (BITS_INV_ACC - 3);
const short IRND_INV_CORR = IRND_INV_COL - 1;
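
// 16-byte aligned constant vectors used by the SSE2 IDCT below: per-lane
// rounding/correction terms and fixed-point tangent/cosine factors for the
// column pass.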
JPGD_SIMD_ALIGN(short, shortM128_one_corr[8]) = { 1, 1, 1, 1, 1, 1, 1, 1 };
JPGD_SIMD_ALIGN(short, shortM128_round_inv_row[8]) =
    { IRND_INV_ROW, 0, IRND_INV_ROW, 0, IRND_INV_ROW, 0, IRND_INV_ROW, 0 };
JPGD_SIMD_ALIGN(short, shortM128_round_inv_col[8]) =
    { IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL,
      IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL };
JPGD_SIMD_ALIGN(short, shortM128_round_inv_corr[8]) =
    { IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR,
      IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR };
JPGD_SIMD_ALIGN(short, shortM128_tg_1_16[8]) =
    { 13036, 13036, 13036, 13036, 13036, 13036, 13036, 13036 };
JPGD_SIMD_ALIGN(short, shortM128_tg_2_16[8]) =
    { 27146, 27146, 27146, 27146, 27146, 27146, 27146, 27146 };
JPGD_SIMD_ALIGN(short, shortM128_tg_3_16[8]) =
    { -21746, -21746, -21746, -21746, -21746, -21746, -21746, -21746 };
JPGD_SIMD_ALIGN(short, shortM128_cos_4_16[8]) =
    { -19195, -19195, -19195, -19195, -19195, -19195, -19195, -19195 };
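
// Coefficient tables for the row pass: shortM128_tab_i_04 is applied to
// input rows 0 and 4, tab_i_17 to rows 1 and 7, tab_i_26 to rows 2 and 6,
// and tab_i_35 to rows 3 and 5.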
JPGD_SIMD_ALIGN(short, shortM128_tab_i_04[]) = {
    16384, 21407, 16384, 8867, 16384, -8867, 16384, -21407,
    16384, 8867, -16384, -21407, -16384, 21407, 16384, -8867,
    22725, 19266, 19266, -4520, 12873, -22725, 4520, -12873,
    12873, 4520, -22725, -12873, 4520, 19266, 19266, -22725
};

JPGD_SIMD_ALIGN(short, shortM128_tab_i_17[]) = {
    22725, 29692, 22725, 12299, 22725, -12299, 22725, -29692,
    22725, 12299, -22725, -29692, -22725, 29692, 22725, -12299,
    31521, 26722, 26722, -6270, 17855, -31521, 6270, -17855,
    17855, 6270, -31521, -17855, 6270, 26722, 26722, -31521
};

JPGD_SIMD_ALIGN(short, shortM128_tab_i_26[]) = {
    21407, 27969, 21407, 11585, 21407, -11585, 21407, -27969,
    21407, 11585, -21407, -27969, -21407, 27969, 21407, -11585,
    29692, 25172, 25172, -5906, 16819, -29692, 5906, -16819,
    16819, 5906, -29692, -16819, 5906, 25172, 25172, -29692
};

JPGD_SIMD_ALIGN(short, shortM128_tab_i_35[]) = {
    19266, 25172, 19266, 10426, 19266, -10426, 19266, -25172,
    19266, 10426, -19266, -25172, -19266, 25172, 19266, -10426,
    26722, 22654, 22654, -5315, 15137, -26722, 5315, -15137,
    15137, 5315, -26722, -15137, 5315, 22654, 22654, -26722
};

JPGD_SIMD_ALIGN(short, shortM128_128[8]) = { 128, 128, 128, 128, 128, 128, 128, 128 };
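
// Inverse DCT of one 8x8 block: pInput holds 64 16-bit coefficients in
// row-major order, pOutputUB receives the level-shifted result as 64
// unsigned 8-bit samples. Both buffers must be 16-byte aligned, since
// aligned SSE2 loads and stores are used throughout.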
void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
{
    __m128i r_xmm0, r_xmm4;
    __m128i r_xmm1, r_xmm2, r_xmm3, r_xmm5, r_xmm6, r_xmm7;
    __m128i row0, row1, row2, row3, row4, row5, row6, row7;
    short* pTab_i_04 = shortM128_tab_i_04;
    short* pTab_i_26 = shortM128_tab_i_26;

    pTab_i_04 = shortM128_tab_i_04;
    pTab_i_26 = shortM128_tab_i_26;
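
    // Row pass, rows 0 and 2: each 1-D row IDCT is computed with
    // _mm_madd_epi16 against the coefficient tables, rounded, shifted right
    // by 12 and packed back to 16 bits in row0 / row2.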
    r_xmm0 = _mm_load_si128((__m128i*)pInput);
    r_xmm4 = _mm_load_si128((__m128i*)(&pInput[2 * 8]));
    r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
    r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
    r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i*)pTab_i_04));
    r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
    r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
    r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i*)&pTab_i_04[16]));
    r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
    r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
    r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i*)&pTab_i_04[8]));
    r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
    r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i*)shortM128_round_inv_row));
    r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
    r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i*)&pTab_i_04[24]));
    r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
    r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
    r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i*)&shortM128_tab_i_26[0]));
    r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
    r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
    r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i*)&shortM128_tab_i_26[8]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
    r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
    r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
    r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i*)&shortM128_tab_i_26[16]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
    r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
    r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i*)shortM128_round_inv_row));
    r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i*)&shortM128_tab_i_26[24]));
    r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
    r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
    r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
    row0 = _mm_packs_epi32(r_xmm0, r_xmm2);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
    r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
    r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
    r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
    r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
    row2 = _mm_packs_epi32(r_xmm4, r_xmm6);
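
    // Row pass, rows 4 and 6: same tables and sequence as rows 0 and 2,
    // results packed into row4 / row6.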
    r_xmm0 = _mm_load_si128((__m128i*)(&pInput[4 * 8]));
    r_xmm4 = _mm_load_si128((__m128i*)(&pInput[6 * 8]));
    r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
    r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
    r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i*)pTab_i_04));
    r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
    r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
    r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i*)&pTab_i_04[16]));
    r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
    r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
    r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i*)&pTab_i_04[8]));
    r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
    r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i*)shortM128_round_inv_row));
    r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
    r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i*)&pTab_i_04[24]));
    r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
    r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
    r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i*)&shortM128_tab_i_26[0]));
    r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
    r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
    r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i*)&shortM128_tab_i_26[8]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
    r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
    r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
    r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i*)&shortM128_tab_i_26[16]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
    r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
    r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i*)shortM128_round_inv_row));
    r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i*)&shortM128_tab_i_26[24]));
    r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
    r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
    r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
    row4 = _mm_packs_epi32(r_xmm0, r_xmm2);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
    r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
    r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
    r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
    r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
    row6 = _mm_packs_epi32(r_xmm4, r_xmm6);
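
    // Row pass, rows 3 and 1: switch to the odd-row tables (tab_i_35 for
    // rows 3/5, tab_i_17 for rows 1/7); results go to row3 / row1.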
    pTab_i_04 = shortM128_tab_i_35;
    pTab_i_26 = shortM128_tab_i_17;
    r_xmm0 = _mm_load_si128((__m128i*)(&pInput[3 * 8]));
    r_xmm4 = _mm_load_si128((__m128i*)(&pInput[1 * 8]));
    r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
    r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
    r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i*)pTab_i_04));
    r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
    r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
    r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i*)&pTab_i_04[16]));
    r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
    r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
    r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i*)&pTab_i_04[8]));
    r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
    r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i*)shortM128_round_inv_row));
    r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
    r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i*)&pTab_i_04[24]));
    r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
    r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
    r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i*)&pTab_i_26[0]));
    r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
    r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
    r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i*)&pTab_i_26[8]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
    r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
    r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
    r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i*)&pTab_i_26[16]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
    r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
    r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i*)shortM128_round_inv_row));
    r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i*)&pTab_i_26[24]));
    r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
    r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
    r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
    row3 = _mm_packs_epi32(r_xmm0, r_xmm2);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
    r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
    r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
    r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
    r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
    row1 = _mm_packs_epi32(r_xmm4, r_xmm6);
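
    // Row pass, rows 5 and 7: same odd-row tables, results in row5 / row7.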
    r_xmm0 = _mm_load_si128((__m128i*)(&pInput[5 * 8]));
    r_xmm4 = _mm_load_si128((__m128i*)(&pInput[7 * 8]));
    r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
    r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
    r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i*)pTab_i_04));
    r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
    r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
    r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i*)&pTab_i_04[16]));
    r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
    r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
    r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i*)&pTab_i_04[8]));
    r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
    r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i*)shortM128_round_inv_row));
    r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
    r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i*)&pTab_i_04[24]));
    r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
    r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
    r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i*)&pTab_i_26[0]));
    r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
    r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
    r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i*)&pTab_i_26[8]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
    r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
    r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
    r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i*)&pTab_i_26[16]));
    r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
    r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
    r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i*)shortM128_round_inv_row));
    r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i*)&pTab_i_26[24]));
    r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
    r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
    r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
    row5 = _mm_packs_epi32(r_xmm0, r_xmm2);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
    r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
    r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
    r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
    r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
    r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
    row7 = _mm_packs_epi32(r_xmm4, r_xmm6);
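
    // Column pass: 1-D IDCT down the columns of row0..row7 using the
    // fixed-point tangent/cosine constants, with the rounding/correction
    // vectors applied before the final shift by SHIFT_INV_COL.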
    r_xmm1 = _mm_load_si128((__m128i*)shortM128_tg_3_16);
    r_xmm0 = _mm_mulhi_epi16(row5, r_xmm1);
    r_xmm1 = _mm_mulhi_epi16(r_xmm1, r_xmm3);
    r_xmm5 = _mm_load_si128((__m128i*)shortM128_tg_1_16);
    r_xmm4 = _mm_mulhi_epi16(row7, r_xmm5);
    r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm2);
    r_xmm5 = _mm_mulhi_epi16(r_xmm5, row1);
    r_xmm1 = _mm_adds_epi16(r_xmm1, r_xmm3);
    r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm3);
    r_xmm3 = _mm_load_si128((__m128i*)shortM128_tg_2_16);
    r_xmm2 = _mm_subs_epi16(r_xmm2, r_xmm1);
    r_xmm7 = _mm_mulhi_epi16(r_xmm7, r_xmm3);
    r_xmm3 = _mm_mulhi_epi16(r_xmm3, row2);
    r_xmm5 = _mm_subs_epi16(r_xmm5, r_xmm6);
    r_xmm4 = _mm_adds_epi16(r_xmm4, row1);
    r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm4);
    r_xmm0 = _mm_adds_epi16(r_xmm0, *((__m128i*)shortM128_one_corr));
    r_xmm4 = _mm_subs_epi16(r_xmm4, r_xmm1);
    r_xmm5 = _mm_subs_epi16(r_xmm5, r_xmm2);
    r_xmm5 = _mm_adds_epi16(r_xmm5, *((__m128i*)shortM128_one_corr));
    r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm2);
    __m128i temp3, temp7;
    r_xmm0 = _mm_load_si128((__m128i*)shortM128_cos_4_16);
    r_xmm4 = _mm_adds_epi16(r_xmm4, r_xmm5);
    r_xmm2 = _mm_load_si128((__m128i*)shortM128_cos_4_16);
    r_xmm2 = _mm_mulhi_epi16(r_xmm2, r_xmm4);
    r_xmm1 = _mm_subs_epi16(r_xmm1, r_xmm5);
    r_xmm7 = _mm_adds_epi16(r_xmm7, row2);
    r_xmm3 = _mm_subs_epi16(r_xmm3, row6);
    r_xmm0 = _mm_mulhi_epi16(r_xmm0, r_xmm1);
    r_xmm5 = _mm_adds_epi16(r_xmm5, r_xmm6);
    r_xmm6 = _mm_subs_epi16(r_xmm6, row4);
    r_xmm4 = _mm_adds_epi16(r_xmm4, r_xmm2);
    r_xmm4 = _mm_or_si128(r_xmm4, *((__m128i*)shortM128_one_corr));
    r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm1);
    r_xmm0 = _mm_or_si128(r_xmm0, *((__m128i*)shortM128_one_corr));
    r_xmm5 = _mm_adds_epi16(r_xmm5, r_xmm7);
    r_xmm5 = _mm_adds_epi16(r_xmm5, *((__m128i*)shortM128_round_inv_col));
    r_xmm2 = _mm_subs_epi16(r_xmm2, r_xmm7);
    r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm3);
    r_xmm6 = _mm_adds_epi16(r_xmm6, *((__m128i*)shortM128_round_inv_col));
    r_xmm7 = _mm_adds_epi16(r_xmm7, r_xmm5);
    r_xmm7 = _mm_srai_epi16(r_xmm7, SHIFT_INV_COL);
    r_xmm1 = _mm_subs_epi16(r_xmm1, r_xmm3);
    r_xmm1 = _mm_adds_epi16(r_xmm1, *((__m128i*)shortM128_round_inv_corr));
    r_xmm2 = _mm_adds_epi16(r_xmm2, *((__m128i*)shortM128_round_inv_corr));
    r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm4);
    r_xmm6 = _mm_srai_epi16(r_xmm6, SHIFT_INV_COL);
    r_xmm1 = _mm_adds_epi16(r_xmm1, r_xmm0);
    r_xmm1 = _mm_srai_epi16(r_xmm1, SHIFT_INV_COL);
    r_xmm7 = _mm_subs_epi16(r_xmm7, r_xmm0);
    r_xmm7 = _mm_srai_epi16(r_xmm7, SHIFT_INV_COL);
    r_xmm5 = _mm_subs_epi16(r_xmm5, temp7);
    r_xmm5 = _mm_srai_epi16(r_xmm5, SHIFT_INV_COL);
    r_xmm3 = _mm_subs_epi16(r_xmm3, r_xmm4);
    r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm2);
    r_xmm2 = _mm_subs_epi16(r_xmm2, temp3);
    r_xmm6 = _mm_srai_epi16(r_xmm6, SHIFT_INV_COL);
    r_xmm2 = _mm_srai_epi16(r_xmm2, SHIFT_INV_COL);
    r_xmm3 = _mm_srai_epi16(r_xmm3, SHIFT_INV_COL);
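
    // Add the 128 level shift to each row of 16-bit results before packing
    // to unsigned bytes.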
    r0 = _mm_add_epi16(*(const __m128i*)shortM128_128, r0);
    r1 = _mm_add_epi16(*(const __m128i*)shortM128_128, r1);
    r2 = _mm_add_epi16(*(const __m128i*)shortM128_128, r2);
    r3 = _mm_add_epi16(*(const __m128i*)shortM128_128, r3);
    r4 = _mm_add_epi16(*(const __m128i*)shortM128_128, r4);
    r5 = _mm_add_epi16(*(const __m128i*)shortM128_128, r5);
    r6 = _mm_add_epi16(*(const __m128i*)shortM128_128, r6);
    r7 = _mm_add_epi16(*(const __m128i*)shortM128_128, r7);
    ((__m128i*)pOutputUB)[0] = _mm_packus_epi16(r0, r1);
    ((__m128i*)pOutputUB)[1] = _mm_packus_epi16(r2, r3);
    ((__m128i*)pOutputUB)[2] = _mm_packus_epi16(r4, r5);
    ((__m128i*)pOutputUB)[3] = _mm_packus_epi16(r6, r7);
}
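
// Usage sketch (hypothetical example, not part of the original decoder; the
// function and variable names below are illustrative only). It shows how a
// caller might feed one 8x8 block of coefficients through idctSSEShortU8.
// JPGD_SIMD_ALIGN is used for both buffers because the transform performs
// aligned 16-byte loads and stores on them.
static inline void idctSSEShortU8_example()
{
    JPGD_SIMD_ALIGN(short, coeffs[64]) = { 0 };  // 8x8 block of coefficients, row-major
    JPGD_SIMD_ALIGN(uint8_t, pixels[64]);        // receives the level-shifted 8x8 output

    coeffs[0] = 32;                  // arbitrary DC-only input for illustration
    idctSSEShortU8(coeffs, pixels);  // pixels[] now holds 64 unsigned 8-bit samples
    (void)pixels;
}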