21
21
22
22
using namespace v8 ;
23
23
24
- // Windows doesn't support the C99 names for these
25
24
#ifdef _MSC_VER
26
- #define isnan (x ) _isnan(x)
27
- #define isinf (x ) (!_finite(x))
25
+ // Windows doesn't support the C99 names for these.
26
+ // TODO: these macros are unnecessary; use std::isnan/std::isinf instead.
27
+ # define isnan (x ) _isnan(x)
28
+ # define isinf (x ) (!_finite(x))
29
+ # include < intrin.h>
30
+ # define bswap32 _byteswap_ulong
31
+ #else
32
+ # ifdef __x86_64__
33
+ # include < x86intrin.h>
34
+ # endif
35
+ # define bswap32 __builtin_bswap32
28
36
#endif
29
37
38
/**
 * Rotates the bits of a 32-bit value to the right.
 *
 * The rotation count is reduced modulo the bit width of `n`, so every value
 * of `c` (including 0 and counts at or above the bit width) is handled
 * without performing an out-of-range shift.
 *
 * @param n Value whose bits are rotated.
 * @param c Number of bit positions to rotate right by.
 * @return `n` rotated right by `c` (mod bit width) positions.
 */
static inline uint32_t rotr(uint32_t n, unsigned int c) {
  // GCC has no portable _rotr intrinsic, so rely on idiom recognition. Works
  // for all supported versions of MSVC, GCC x86, GCC ARM, Clang.
  // https://stackoverflow.com/a/776523/1218408
  const unsigned int mask = CHAR_BIT * sizeof(n) - 1;
  c &= mask;
  // (~c + 1) is the two's-complement negation of c, written without unary
  // minus on an unsigned operand. Masking it keeps the left-shift count in
  // range, so c == 0 yields (n >> 0) | (n << 0) == n.
  return (n >> c) | (n << ((~c + 1) & mask));
}
46
+
30
47
#ifndef isnan
31
48
#define isnan (x ) std::isnan(x)
32
49
#define isinf (x ) std::isinf(x)
@@ -852,32 +869,71 @@ NAN_METHOD(Context2d::PutImageData) {
852
869
for (int y = 0 ; y < rows; ++y) {
853
870
uint8_t *dstRow = dst;
854
871
uint8_t *srcRow = src;
855
- for (int x = 0 ; x < cols; ++x) {
856
- // rgba
857
- uint8_t r = *srcRow++;
858
- uint8_t g = *srcRow++;
859
- uint8_t b = *srcRow++;
860
- uint8_t a = *srcRow++;
872
+ #if defined(__x86_64__) || defined(_M_X64)
873
+ int x = 0 ;
874
+ for (; x < cols - 1 ; x += 2 ) { // Two columns at a time
875
+ // Fast path if both alphas are 0.
876
+ uint64_t px64;
877
+ memcpy (&px64, srcRow, 8 );
878
+ if ((px64 & 0xFF000000'FF000000 ) == 0 ) {
879
+ memset (dstRow, 0 , 8 );
880
+ dstRow += 8 ;
881
+ srcRow += 8 ;
882
+ continue ;
883
+ }
861
884
862
- // argb
863
- // performance optimization: fully transparent/opaque pixels can be
864
- // processed more efficiently.
885
+ __m128i px;
886
+ memcpy (&px, srcRow, 8 ); // gcc doesn't define _mm_loadu_si64
887
+ px = _mm_unpacklo_epi8 (px, _mm_setzero_si128 ());
888
+ // rgba -> bgra
889
+ px = _mm_shufflelo_epi16 (px, 0b11000110 );
890
+ px = _mm_shufflehi_epi16 (px, 0b11000110 );
891
+
892
+ // Fast path if both alphas are 255.
893
+ const uint16_t aMask = 0b11000000'11000000 ;
894
+ const __m128i a255 = _mm_set_epi16 (0xFF , 0 , 0 , 0 , 0xFF , 0 , 0 , 0 );
895
+ __m128i aIs255 = _mm_cmpeq_epi16 (px, a255);
896
+ int aIs255k = _mm_movemask_epi8 (aIs255);
897
+ if ((aIs255k & aMask) != aMask) {
898
+ // broadcast alpha
899
+ __m128i av = _mm_shufflelo_epi16 (px, 0b11111111 );
900
+ av = _mm_shufflehi_epi16 (av, 0b11111111 );
901
+ // Multiply by alpha.
902
+ // Set alpha channel multiplier to 255 to undo upcoming division by 255
903
+ av = _mm_or_si128 (av, a255);
904
+ px = _mm_mullo_epi16 (px, av);
905
+ // divide by 255
906
+ px = _mm_mulhi_epu16 (px, _mm_set1_epi16 (0x8081 ));
907
+ px = _mm_srli_epi16 (px, 7 );
908
+ }
909
+
910
+ // pack int16 to int8
911
+ px = _mm_packus_epi16 (px, px);
912
+ memcpy (dstRow, &px, 8 );
913
+ dstRow += 8 ;
914
+ srcRow += 8 ;
915
+ }
916
+ if (cols & 1 ) {
917
+ #else
918
+ for (int x = 0 ; x < cols; x++) {
919
+ #endif
920
+ uint32_t c;
921
+ memcpy (&c, srcRow, 4 ); // rgba (LE)
922
+ srcRow += 4 ;
923
+ uint32_t a = c >> 24 ;
865
924
if (a == 0 ) {
866
- *dstRow++ = 0 ;
867
- *dstRow++ = 0 ;
868
- *dstRow++ = 0 ;
869
- *dstRow++ = 0 ;
870
- } else if (a == 255 ) {
871
- *dstRow++ = b;
872
- *dstRow++ = g;
873
- *dstRow++ = r;
874
- *dstRow++ = a;
925
+ uint32_t zero = 0 ;
926
+ memcpy (dstRow, &zero, 4 );
927
+ } else if (a == 255 ) { // rgba (LE)
928
+ c = bswap32 (c); // abgr
929
+ c = rotr (c, 8 ); // bgra
930
+ memcpy (dstRow, &c, 4 );
875
931
} else {
876
- float alpha = (float ) a / 255 ;
877
- *dstRow++ = b * alpha ;
878
- *dstRow++ = g * alpha ;
879
- *dstRow++ = r * alpha ;
880
- * dstRow++ = a ;
932
+ uint8_t r = (c & 0xFF ) * a / 255 ;
933
+ uint8_t g = (c >> 8 & 0xFF ) * a / 255 ;
934
+ uint8_t b = (c >> 16 & 0xFF ) * a / 255 ;
935
+ uint32_t bgra = (a << 24 ) | (r << 16 ) | (g << 8 ) | b ;
936
+ memcpy ( dstRow, &bgra, 4 ) ;
881
937
}
882
938
}
883
939
dst += dstStride;
@@ -892,13 +948,13 @@ NAN_METHOD(Context2d::PutImageData) {
892
948
uint8_t *dstRow = dst;
893
949
uint8_t *srcRow = src;
894
950
for (int x = 0 ; x < cols; ++x) {
895
- // rgba
951
+ // rgb[a]
896
952
uint8_t r = *srcRow++;
897
953
uint8_t g = *srcRow++;
898
954
uint8_t b = *srcRow++;
899
955
srcRow++;
900
956
901
- // argb
957
+ // bgra
902
958
*dstRow++ = b;
903
959
*dstRow++ = g;
904
960
*dstRow++ = r;
0 commit comments