#ifndef BMSSE_UTIL__H__INCLUDED__
#define BMSSE_UTIL__H__INCLUDED__

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"

    // from sse2_xor_arr_2_mask: dst = *src ^ mask
    __m128i xM = _mm_set1_epi32((int)mask);
    do
    {
        _mm_store_si128(dst+0, _mm_xor_si128(_mm_load_si128(src+0), xM));
        _mm_store_si128(dst+1, _mm_xor_si128(_mm_load_si128(src+1), xM));
        _mm_store_si128(dst+2, _mm_xor_si128(_mm_load_si128(src+2), xM));
        _mm_store_si128(dst+3, _mm_xor_si128(_mm_load_si128(src+3), xM));
        dst += 4; src += 4;            // advance restored: four vectors per iteration
    }
    while (src < src_end);
    // from sse2_andnot_arr_2_mask: dst = ~*src & mask
    __m128i xM = _mm_set1_epi32((int)mask);
    do
    {
        _mm_store_si128(dst+0, _mm_andnot_si128(_mm_load_si128(src+0), xM));
        _mm_store_si128(dst+1, _mm_andnot_si128(_mm_load_si128(src+1), xM));
        _mm_store_si128(dst+2, _mm_andnot_si128(_mm_load_si128(src+2), xM));
        _mm_store_si128(dst+3, _mm_andnot_si128(_mm_load_si128(src+3), xM));
        dst += 4; src += 4;            // advance restored
    }
    while (src < src_end);
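
For orientation, a minimal usage sketch of the two mask kernels above. This is an illustration only: it assumes the sse2_xor_arr_2_mask signature from the summary at the end of this header, that the function lives in namespace bm like the rest of the library, a 2048-word bit-block, and 16-byte-aligned buffers; the helper name is hypothetical.

    // Illustration only: dst_blk = src_blk ^ mask over one aligned 2048-word block.
    void xor_block_with_mask(unsigned* dst_blk, const unsigned* src_blk, unsigned mask)
    {
        const __m128i* s     = (const __m128i*) src_blk;
        const __m128i* s_end = (const __m128i*)(src_blk + 2048);  // 4 words per __m128i
        bm::sse2_xor_arr_2_mask((__m128i*) dst_blk, s, s_end, mask);
    }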
    // AND kernel: dst &= *src, OR-accumulating the result so the caller can tell
    // whether any bits remain (see sse2_and_block in the summary below)
    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;

    accA = accB = accC = accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
        m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
        m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
        m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));

        _mm_store_si128(dst+0, m1A);
        _mm_store_si128(dst+1, m1B);
        _mm_store_si128(dst+2, m1C);
        _mm_store_si128(dst+3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;            // advance restored
    }
    while (src < src_end);

    accA = _mm_or_si128(accA, accB);
    accC = _mm_or_si128(accC, accD);
    accA = _mm_or_si128(accA, accC);   // fold the four OR-accumulators into one

    bm::word_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;  // aligned scratch (declaration restored)
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
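
The OR-accumulators (accA..accD) exist so the caller can tell whether anything survived the AND: the return value is zero only when the destination block became completely empty. A hedged caller-side sketch, assuming the two-argument sse2_and_block from the summary list and namespace bm; the release policy is a placeholder, not library behavior.

    // Illustration only: AND src into dst and report whether dst still has bits.
    bool and_and_check(__m128i* dst_blk, const __m128i* src_blk)
    {
        unsigned any = bm::sse2_and_block(dst_blk, src_blk);  // dst &= src
        if (!any)
        {
            // the whole block is now zero; a caller could release or
            // mark it as empty here (placeholder)
            return false;
        }
        return true;
    }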
    // unaligned-source AND kernel (sse2_and_arr_unal): dst &= *src
    __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m128i accA, accB, accC, accD;

    accA = _mm_setzero_si128();
    accB = _mm_setzero_si128();
    accC = _mm_setzero_si128();
    accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_loadu_si128(src+0);
        m2A = _mm_load_si128(dst+0);
        m1A = _mm_and_si128(m1A, m2A);
        _mm_store_si128(dst+0, m1A);
        accA = _mm_or_si128(accA, m1A);

        m1B = _mm_loadu_si128(src+1);
        m2B = _mm_load_si128(dst+1);
        m1B = _mm_and_si128(m1B, m2B);
        _mm_store_si128(dst+1, m1B);
        accB = _mm_or_si128(accB, m1B);

        m1C = _mm_loadu_si128(src+2);
        m2C = _mm_load_si128(dst+2);
        m1C = _mm_and_si128(m1C, m2C);
        _mm_store_si128(dst+2, m1C);
        accC = _mm_or_si128(accC, m1C);

        m1D = _mm_loadu_si128(src+3);
        m2D = _mm_load_si128(dst+3);
        m1D = _mm_and_si128(m1D, m2D);
        _mm_store_si128(dst+3, m1D);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;            // advance restored
    }
    while (src < src_end);

    accA = _mm_or_si128(accA, accB);
    accC = _mm_or_si128(accC, accD);
    accA = _mm_or_si128(accA, accC);

    bm::word_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;  // aligned scratch (declaration restored)
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
    // second aligned AND kernel (same dst &= *src semantics, explicit src_end bound)
    __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m128i accA, accB, accC, accD;

    accA = _mm_setzero_si128();
    accB = _mm_setzero_si128();
    accC = _mm_setzero_si128();
    accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_load_si128(src + 0);
        m2A = _mm_load_si128(dst + 0);
        m1A = _mm_and_si128(m1A, m2A);
        _mm_store_si128(dst + 0, m1A);
        accA = _mm_or_si128(accA, m1A);

        m1B = _mm_load_si128(src + 1);
        m2B = _mm_load_si128(dst + 1);
        m1B = _mm_and_si128(m1B, m2B);
        _mm_store_si128(dst + 1, m1B);
        accB = _mm_or_si128(accB, m1B);

        m1C = _mm_load_si128(src + 2);
        m2C = _mm_load_si128(dst + 2);
        m1C = _mm_and_si128(m1C, m2C);
        _mm_store_si128(dst + 2, m1C);
        accC = _mm_or_si128(accC, m1C);

        m1D = _mm_load_si128(src + 3);
        m2D = _mm_load_si128(dst + 3);
        m1D = _mm_and_si128(m1D, m2D);
        _mm_store_si128(dst + 3, m1D);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;            // advance restored
    }
    while (src < src_end);

    accA = _mm_or_si128(accA, accB);
    accC = _mm_or_si128(accC, accD);
    accA = _mm_or_si128(accA, accC);

    bm::word_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;  // aligned scratch (declaration restored)
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
    // from sse2_or_block: dst |= *src, returns true if the resulting block is all ones
    __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m128i mAccF0 = _mm_set1_epi32(~0u);   // running AND of the results (all-ones detector)
    __m128i mAccF1 = _mm_set1_epi32(~0u);

    do
    {
        m1A = _mm_load_si128(src + 0);
        m2A = _mm_load_si128(dst + 0);
        m1A = _mm_or_si128(m1A, m2A);
        _mm_store_si128(dst + 0, m1A);

        m1B = _mm_load_si128(src + 1);
        m2B = _mm_load_si128(dst + 1);
        m1B = _mm_or_si128(m1B, m2B);
        _mm_store_si128(dst + 1, m1B);

        m1C = _mm_load_si128(src + 2);
        m2C = _mm_load_si128(dst + 2);
        m1C = _mm_or_si128(m1C, m2C);
        _mm_store_si128(dst + 2, m1C);

        m1D = _mm_load_si128(src + 3);
        m2D = _mm_load_si128(dst + 3);
        m1D = _mm_or_si128(m1D, m2D);
        _mm_store_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src += 4; dst += 4;            // advance restored
    }
    while (src < src_end);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
    return (maskA == 0xFFFFu);
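
mAccF0 and mAccF1 keep a running AND of every result vector, so after the loop they are all ones only if every byte written to dst was 0xFF; _mm_cmpeq_epi8 followed by _mm_movemask_epi8 == 0xFFFF turns that into the bool result. A scalar sketch of the same check, for illustration only (2048-word block assumed):

    // Scalar equivalent of the all-ones detection above.
    bool block_is_full(const unsigned* blk)
    {
        unsigned acc = ~0u;
        for (unsigned i = 0; i < 2048; ++i)
            acc &= blk[i];             // stays ~0u only if every word is all ones
        return acc == ~0u;
    }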
    // from sse2_or_arr_unal: dst |= *src (src may be unaligned),
    // returns true if the resulting block is all ones
    __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m128i mAccF0 = _mm_set1_epi32(~0u);
    __m128i mAccF1 = _mm_set1_epi32(~0u);

    do
    {
        m1A = _mm_loadu_si128(src + 0);
        m2A = _mm_load_si128(dst + 0);
        m1A = _mm_or_si128(m1A, m2A);
        _mm_store_si128(dst + 0, m1A);

        m1B = _mm_loadu_si128(src + 1);
        m2B = _mm_load_si128(dst + 1);
        m1B = _mm_or_si128(m1B, m2B);
        _mm_store_si128(dst + 1, m1B);

        m1C = _mm_loadu_si128(src + 2);
        m2C = _mm_load_si128(dst + 2);
        m1C = _mm_or_si128(m1C, m2C);
        _mm_store_si128(dst + 2, m1C);

        m1D = _mm_loadu_si128(src + 3);
        m2D = _mm_load_si128(dst + 3);
        m1D = _mm_or_si128(m1D, m2D);
        _mm_store_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src += 4; dst += 4;            // advance restored
    }
    while (src < src_end);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
    return (maskA == 0xFFFFu);
    // from sse2_or_block_2way: dst = *src1 | *src2,
    // returns true if the resulting block is all ones
    __m128i m1A, m1B, m1C, m1D;
    __m128i mAccF0 = _mm_set1_epi32(~0u);
    __m128i mAccF1 = _mm_set1_epi32(~0u);

    do
    {
        m1A = _mm_or_si128(_mm_load_si128(src1 + 0), _mm_load_si128(src2 + 0));
        m1B = _mm_or_si128(_mm_load_si128(src1 + 1), _mm_load_si128(src2 + 1));
        m1C = _mm_or_si128(_mm_load_si128(src1 + 2), _mm_load_si128(src2 + 2));
        m1D = _mm_or_si128(_mm_load_si128(src1 + 3), _mm_load_si128(src2 + 3));

        _mm_store_si128(dst + 0, m1A);
        _mm_store_si128(dst + 1, m1B);
        _mm_store_si128(dst + 2, m1C);
        _mm_store_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src1 += 4; src2 += 4; dst += 4;
    }
    while (src1 < src_end1);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
    return (maskA == 0xFFFFu);
    // from sse2_or_block_3way: dst |= *src1 | *src2,
    // returns true if the resulting block is all ones
    __m128i m1A, m1B, m1C, m1D;
    __m128i mAccF0 = _mm_set1_epi32(~0u);
    __m128i mAccF1 = _mm_set1_epi32(~0u);

    do
    {
        m1A = _mm_or_si128(_mm_load_si128(src1 + 0), _mm_load_si128(dst + 0));
        m1B = _mm_or_si128(_mm_load_si128(src1 + 1), _mm_load_si128(dst + 1));
        m1C = _mm_or_si128(_mm_load_si128(src1 + 2), _mm_load_si128(dst + 2));
        m1D = _mm_or_si128(_mm_load_si128(src1 + 3), _mm_load_si128(dst + 3));

        m1A = _mm_or_si128(m1A, _mm_load_si128(src2 + 0));
        m1B = _mm_or_si128(m1B, _mm_load_si128(src2 + 1));
        m1C = _mm_or_si128(m1C, _mm_load_si128(src2 + 2));
        m1D = _mm_or_si128(m1D, _mm_load_si128(src2 + 3));

        _mm_store_si128(dst + 0, m1A);
        _mm_store_si128(dst + 1, m1B);
        _mm_store_si128(dst + 2, m1C);
        _mm_store_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src1 += 4; src2 += 4; dst += 4;
    }
    while (src1 < src_end1);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
    return (maskA == 0xFFFFu);
    // from sse2_or_block_5way: dst |= *src1 | *src2 | *src3 | *src4
    // (non-temporal stores), returns true if the resulting block is all ones
    __m128i m1A, m1B, m1C, m1D;
    __m128i mAccF0 = _mm_set1_epi32(~0u);
    __m128i mAccF1 = _mm_set1_epi32(~0u);

    do
    {
        m1A = _mm_or_si128(_mm_load_si128(src1 + 0), _mm_load_si128(dst + 0));
        m1B = _mm_or_si128(_mm_load_si128(src1 + 1), _mm_load_si128(dst + 1));
        m1C = _mm_or_si128(_mm_load_si128(src1 + 2), _mm_load_si128(dst + 2));
        m1D = _mm_or_si128(_mm_load_si128(src1 + 3), _mm_load_si128(dst + 3));

        m1A = _mm_or_si128(m1A, _mm_load_si128(src2 + 0));
        m1B = _mm_or_si128(m1B, _mm_load_si128(src2 + 1));
        m1C = _mm_or_si128(m1C, _mm_load_si128(src2 + 2));
        m1D = _mm_or_si128(m1D, _mm_load_si128(src2 + 3));

        m1A = _mm_or_si128(m1A, _mm_load_si128(src3 + 0));
        m1B = _mm_or_si128(m1B, _mm_load_si128(src3 + 1));
        m1C = _mm_or_si128(m1C, _mm_load_si128(src3 + 2));
        m1D = _mm_or_si128(m1D, _mm_load_si128(src3 + 3));

        m1A = _mm_or_si128(m1A, _mm_load_si128(src4 + 0));
        m1B = _mm_or_si128(m1B, _mm_load_si128(src4 + 1));
        m1C = _mm_or_si128(m1C, _mm_load_si128(src4 + 2));
        m1D = _mm_or_si128(m1D, _mm_load_si128(src4 + 3));

        _mm_stream_si128(dst + 0, m1A);
        _mm_stream_si128(dst + 1, m1B);
        _mm_stream_si128(dst + 2, m1C);
        _mm_stream_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src1 += 4; src2 += 4;
        src3 += 4; src4 += 4;
        dst += 4;                      // dst advance restored

        _mm_prefetch((const char*)src3, _MM_HINT_T0);
        _mm_prefetch((const char*)src4, _MM_HINT_T0);
    }
    while (src1 < src_end1);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
    return (maskA == 0xFFFFu);
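
The 5-way OR uses _mm_stream_si128 (non-temporal stores that bypass the cache) plus _mm_prefetch on the upcoming source lines, which pays off when the destination is large and not read back immediately. As a hedged reminder of the usual companion step, not shown in these excerpts and assumed to be handled by the surrounding code: streamed stores should be fenced before other code reads the destination.

    #include <emmintrin.h>
    // Illustration only: make preceding _mm_stream_si128 stores globally
    // visible before normal loads touch the same memory.
    inline void finish_streaming_stores()
    {
        _mm_sfence();
    }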
    // from sse2_xor_block: dst ^= *src, returns non-zero if any bit is set in the result
    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;

    accA = accB = accC = accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_xor_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
        m1B = _mm_xor_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
        m1C = _mm_xor_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
        m1D = _mm_xor_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));

        _mm_store_si128(dst+0, m1A);
        _mm_store_si128(dst+1, m1B);
        _mm_store_si128(dst+2, m1C);
        _mm_store_si128(dst+3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;            // advance restored
    }
    while (src < src_end);

    accA = _mm_or_si128(accA, accB);
    accC = _mm_or_si128(accC, accD);
    accA = _mm_or_si128(accA, accC);

    bm::word_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;  // aligned scratch (declaration restored)
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
    // from sse2_xor_block_2way: dst = *src1 ^ *src2,
    // returns non-zero if any bit is set in the result
    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;

    accA = accB = accC = accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_xor_si128(_mm_load_si128(src1 + 0), _mm_load_si128(src2 + 0));
        m1B = _mm_xor_si128(_mm_load_si128(src1 + 1), _mm_load_si128(src2 + 1));
        m1C = _mm_xor_si128(_mm_load_si128(src1 + 2), _mm_load_si128(src2 + 2));
        m1D = _mm_xor_si128(_mm_load_si128(src1 + 3), _mm_load_si128(src2 + 3));

        _mm_store_si128(dst + 0, m1A);
        _mm_store_si128(dst + 1, m1B);
        _mm_store_si128(dst + 2, m1C);
        _mm_store_si128(dst + 3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src1 += 4; src2 += 4; dst += 4;
    }
    while (src1 < src1_end);

    accA = _mm_or_si128(accA, accB);
    accC = _mm_or_si128(accC, accD);
    accA = _mm_or_si128(accA, accC);

    bm::word_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;  // aligned scratch (declaration restored)
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
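
Because dst receives *src1 ^ *src2 and the return value ORs every result word together, a zero return means the two source blocks were bit-for-bit identical. A hedged caller sketch (namespace bm and the three-argument signature from the summary list are assumed; the helper name is hypothetical):

    // Illustration only: compute the difference of two blocks and detect equality.
    bool blocks_equal_via_xor(__m128i* diff_blk,
                              const __m128i* blk_a, const __m128i* blk_b)
    {
        unsigned any_diff = bm::sse2_xor_block_2way(diff_blk, blk_a, blk_b);
        return any_diff == 0;          // no differing bits means the blocks are equal
    }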
    // from sse2_sub_block: dst &= ~*src (AND-NOT / SUB),
    // returns non-zero if any bit is left in dst
    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;

    accA = accB = accC = accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
        m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
        m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
        m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));

        _mm_store_si128(dst+0, m1A);
        _mm_store_si128(dst+1, m1B);
        _mm_store_si128(dst+2, m1C);
        _mm_store_si128(dst+3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;            // advance restored
    }
    while (src < src_end);

    accA = _mm_or_si128(accA, accB);
    accC = _mm_or_si128(accC, accD);
    accA = _mm_or_si128(accA, accC);

    bm::word_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;  // aligned scratch (declaration restored)
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
    // from sse2_set_block: fill the block with a 32-bit value (dst = value)
    __m128i xmm0 = _mm_set1_epi32((int)value);
    do
    {
        _mm_store_si128(dst,   xmm0);
        _mm_store_si128(dst+1, xmm0);
        _mm_store_si128(dst+2, xmm0);
        _mm_store_si128(dst+3, xmm0);

        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm0);
        _mm_store_si128(dst+6, xmm0);
        _mm_store_si128(dst+7, xmm0);

        dst += 8;                      // advance restored: eight vectors per iteration
    }
    while (dst < dst_end);
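
sse2_set_block broadcasts a 32-bit value and writes eight vectors per iteration, so it doubles as a fast block clear or block fill; dst_end in the loop is assumed to be derived internally from the block size. A hedged usage sketch (namespace bm and the two-argument signature from the summary list are assumptions of this illustration):

    // Illustration only: clear one block, then set every bit in another.
    void clear_and_fill(__m128i* blk_to_clear, __m128i* blk_to_fill)
    {
        bm::sse2_set_block(blk_to_clear, 0u);   // all bits off
        bm::sse2_set_block(blk_to_fill, ~0u);   // all bits on
    }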
    // from sse2_copy_block: dst = *src
    __m128i xmm0, xmm1, xmm2, xmm3;
    do
    {
        xmm0 = _mm_load_si128(src+0);
        xmm1 = _mm_load_si128(src+1);
        xmm2 = _mm_load_si128(src+2);
        xmm3 = _mm_load_si128(src+3);

        _mm_store_si128(dst+0, xmm0);
        _mm_store_si128(dst+1, xmm1);
        _mm_store_si128(dst+2, xmm2);
        _mm_store_si128(dst+3, xmm3);

        xmm0 = _mm_load_si128(src+4);
        xmm1 = _mm_load_si128(src+5);
        xmm2 = _mm_load_si128(src+6);
        xmm3 = _mm_load_si128(src+7);

        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm1);
        _mm_store_si128(dst+6, xmm2);
        _mm_store_si128(dst+7, xmm3);

        src += 8; dst += 8;            // advance restored
    }
    while (src < src_end);
    // from sse2_stream_block: dst = *src using non-temporal (streaming) stores
    __m128i xmm0, xmm1, xmm2, xmm3;
    do
    {
        xmm0 = _mm_load_si128(src+0);
        xmm1 = _mm_load_si128(src+1);
        xmm2 = _mm_load_si128(src+2);
        xmm3 = _mm_load_si128(src+3);

        _mm_stream_si128(dst+0, xmm0);
        _mm_stream_si128(dst+1, xmm1);
        _mm_stream_si128(dst+2, xmm2);
        _mm_stream_si128(dst+3, xmm3);

        xmm0 = _mm_load_si128(src+4);
        xmm1 = _mm_load_si128(src+5);
        xmm2 = _mm_load_si128(src+6);
        xmm3 = _mm_load_si128(src+7);

        _mm_stream_si128(dst+4, xmm0);
        _mm_stream_si128(dst+5, xmm1);
        _mm_stream_si128(dst+6, xmm2);
        _mm_stream_si128(dst+7, xmm3);

        src += 8; dst += 8;            // advance restored
    }
    while (src < src_end);
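
The only difference from sse2_copy_block above is the store type: streaming stores avoid polluting the cache with a destination that will not be read again soon. A hedged sketch of how a caller might choose between the two (namespace bm assumed; the dst_is_cold flag is purely illustrative):

    // Illustration only: pick the streaming copy for cold destinations.
    void copy_bit_block(__m128i* dst_blk, const __m128i* src_blk, bool dst_is_cold)
    {
        if (dst_is_cold)
            bm::sse2_stream_block(dst_blk, src_blk);
        else
            bm::sse2_copy_block(dst_blk, src_blk);
    }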
    // from sse2_invert_block: dst = ~*dst (XOR every word with all ones)
    __m128i maskF = _mm_set1_epi32(~0u);

    __m128i mA, mB, mC, mD;
    do
    {
        mA = _mm_load_si128(dst + 0);
        mA = _mm_xor_si128(mA, maskF);
        _mm_store_si128(dst + 0, mA);

        mB = _mm_load_si128(dst + 1);
        mB = _mm_xor_si128(mB, maskF);
        _mm_store_si128(dst + 1, mB);

        mC = _mm_load_si128(dst + 2);
        mC = _mm_xor_si128(mC, maskF);
        _mm_store_si128(dst + 2, mC);

        mD = _mm_load_si128(dst + 3);
        mD = _mm_xor_si128(mD, maskF);
        _mm_store_si128(dst + 3, mD);

        dst += 4;                      // advance restored
    }
    while (dst < (__m128i*)dst_end);
BMFORCEINLINE __m128i sse2_and(__m128i a, __m128i b) { return _mm_and_si128(a, b); }
BMFORCEINLINE __m128i sse2_or (__m128i a, __m128i b) { return _mm_or_si128(a, b); }
BMFORCEINLINE __m128i sse2_xor(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
BMFORCEINLINE __m128i sse2_sub(__m128i a, __m128i b) { return _mm_andnot_si128(b, a); }  // a & ~b
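
These single-vector wrappers mainly exist so that higher-level code can compose the four logical operations without spelling out intrinsics. A hedged composition sketch (namespace bm assumed; the helper is illustrative, not part of this header):

    // Illustration only: combine three vectors as (a | b) & ~c using the wrappers.
    __m128i or_and_not(__m128i a, __m128i b, __m128i c)
    {
        return bm::sse2_sub(bm::sse2_or(a, b), c);   // (a | b) & ~c
    }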
// sse2_gap_sum_arr: GAP block population count (array sum) utility
inline
const bm::gap_word_t* sse2_gap_sum_arr(const bm::gap_word_t* BMRESTRICT pbuf,
                                       unsigned sse_vect_waves,
                                       unsigned* sum)
{
    __m128i xcnt = _mm_setzero_si128();

    for (unsigned i = 0; i < sse_vect_waves; ++i)
    {
        __m128i mm0 = _mm_loadu_si128((__m128i*)(pbuf - 1));
        __m128i mm1 = _mm_loadu_si128((__m128i*)(pbuf + 8 - 1));
        __m128i mm_s2 = _mm_add_epi16(mm1, mm0);
        xcnt = _mm_add_epi16(xcnt, mm_s2);
        pbuf += 16;                    // one wave covers 16 gap words (advance restored)
    }
    xcnt = _mm_sub_epi16(_mm_srli_epi32(xcnt, 16), xcnt);

    unsigned short* cnt8 = (unsigned short*)&xcnt;
    *sum += (cnt8[0]) + (cnt8[2]) + (cnt8[4]) + (cnt8[6]);
    return pbuf;                       // return restored: points past the processed waves
}
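
A hedged usage sketch for the GAP sum utility above. It is an illustration only: it assumes namespace bm, that pbuf points into a GAP (RLE) coordinate array with at least sse_vect_waves * 16 valid gap words starting at pbuf, and that the element immediately before pbuf (which the kernel also reads) is valid.

    // Illustration only: accumulate the vector-friendly part of a GAP array sum.
    unsigned gap_array_sum(const bm::gap_word_t* buf, unsigned waves)
    {
        unsigned sum = 0;
        const bm::gap_word_t* tail = bm::sse2_gap_sum_arr(buf, waves, &sum);
        (void)tail;  // remaining (non-vector) elements would be handled by scalar code
        return sum;
    }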
#pragma GCC diagnostic pop

Function, type and constant summary for this header:

void sse2_xor_arr_2_mask(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src, const __m128i *BMRESTRICT src_end, bm::word_t mask)
    XOR array elements with the specified mask: dst = *src ^ mask.

void sse2_andnot_arr_2_mask(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src, const __m128i *BMRESTRICT src_end, bm::word_t mask)
    Invert array elements and AND them with the specified mask: dst = ~*src & mask.

unsigned sse2_and_block(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src)
    AND blocks: dst &= *src.

unsigned sse2_and_arr_unal(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src, const __m128i *BMRESTRICT src_end)
    AND array elements against another array (unaligned source): dst &= *src.

bool sse2_or_block(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src)
    OR array elements against another array: dst |= *src.

bool sse2_or_arr_unal(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src, const __m128i *BMRESTRICT src_end)
    OR array elements against another array (unaligned source): dst |= *src.

bool sse2_or_block_2way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2)
    OR two blocks and copy the result to the destination: dst = *src1 | *src2.

bool sse2_or_block_3way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2)
    OR array elements against two other arrays: dst |= *src1 | *src2.

bool sse2_or_block_5way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2, const __m128i *BMRESTRICT src3, const __m128i *BMRESTRICT src4)
    OR array elements against four other arrays: dst |= *src1 | *src2 | *src3 | *src4.

unsigned sse2_xor_block(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src)
    XOR block against another: dst ^= *src.

unsigned sse2_xor_block_2way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2)
    Three-operand XOR: dst = *src1 ^ *src2.

unsigned sse2_sub_block(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src)
    AND-NOT (SUB) array elements against another array: dst &= ~*src.

BMFORCEINLINE void sse2_set_block(__m128i *BMRESTRICT dst, bm::word_t value)
    SSE2 block memset: dst = value.

BMFORCEINLINE void sse2_copy_block(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src)
    SSE2 block copy: dst = *src.

BMFORCEINLINE void sse2_stream_block(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src)
    SSE2 block copy with non-temporal stores: dst = *src.

void sse2_invert_block(__m128i *dst)
    Invert a bit block: dst = ~*dst (XOR with all ones).

BMFORCEINLINE __m128i sse2_and(__m128i a, __m128i b)
BMFORCEINLINE __m128i sse2_or(__m128i a, __m128i b)
BMFORCEINLINE __m128i sse2_xor(__m128i a, __m128i b)
BMFORCEINLINE __m128i sse2_sub(__m128i a, __m128i b)
    Single-vector AND / OR / XOR / SUB (AND-NOT) wrappers.

const bm::gap_word_t * sse2_gap_sum_arr(const bm::gap_word_t *BMRESTRICT pbuf, unsigned sse_vect_waves, unsigned *sum)
    GAP block population count (array sum) utility.

sse_empty_guard: SSE2 reinitialization guard class (BMFORCEINLINE constructor and destructor).

unsigned short gap_word_t, const unsigned set_block_size
    Supporting typedef and block size constant used by the kernels above.
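
As a final illustration tying the summary back to the kernels, a hedged sketch of the SUB (set difference) call pattern. Namespace bm and the two-argument sse2_sub_block signature listed above are assumed; what to do with an emptied block is a placeholder decision, not library behavior.

    // Illustration only: set difference dst = dst & ~src, then report emptiness.
    bool subtract_block(__m128i* dst_blk, const __m128i* src_blk)
    {
        unsigned any = bm::sse2_sub_block(dst_blk, src_blk);
        return any != 0;   // false: dst became empty and could be released by the caller
    }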