import argparse
import os

import numpy as np
import torch
from torch.optim import AdamW
from transformers import BertTokenizer

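# Fix random seeds for reproducibility.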
torch.manual_seed(0)
np.random.seed(0)

from src.data import GlueDatasetLoader
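# Default task head; main() rebinds Embedder for non-benchmark runs.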
from src.benchmark import GLUEWrapper as Embedder
from src.tan import TAN
from src.trainer import train
from src.transformer import Transformer
from src.utils import timer


class Experiment:
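    """Container for a single training run.

    Builds the tokenizer, GLUE data loaders, a TAN embedder, and (optionally)
    a baseline Transformer embedder, each with its own AdamW optimizer. Model
    size is selected by ``args.config``; pretrained weights are loaded when
    ``args.from_pretrained`` is set.
    """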
    def __init__(
        self,
        args,
        vocab_size: int = 30552,
        d_model: int = 128,
        num_hidden_layers: int = 1,
        num_attention_heads: int = 1,
        dropout_prob: float = 0.2,
        max_seq_len: int = 128,
        num_epochs: int = 5,
        batch_size: int = 256,
        learning_rate: float = 2e-5,
        warmup_steps: int = 1000,
        train_datasets=None,
        validation_datasets=None,
        include_baseline=False,
        num_outputs=None,
        num_sentences=None,
    ):

        self.vocab_size = vocab_size
        self.d_model = d_model
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_dropout_prob = dropout_prob
        self.attention_probs_dropout_prob = dropout_prob
        self.max_seq_len = max_seq_len
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.learning_rate = learning_rate
        self.warmup_steps = warmup_steps
        self.train_datasets = train_datasets if train_datasets is not None else ["snli", "mnli"]
        self.validation_datasets = validation_datasets if validation_datasets is not None else ["stsb"]
        self.include_baseline = include_baseline

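        # Preset model sizes selected via --config override the constructor
        # defaults for attention heads, hidden layers, and d_model.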
        if args.config == "atomic":
            self.num_attention_heads = 1
            self.num_hidden_layers = 1
            self.d_model = 64
        elif args.config == "nano":
            self.num_attention_heads = 2
            self.num_hidden_layers = 2
            self.d_model = 128
        elif args.config == "micro":
            self.num_attention_heads = 2
            self.num_hidden_layers = 2
            self.d_model = 256
        elif args.config == "milli":
            self.num_attention_heads = 1
            self.num_hidden_layers = 1
            self.d_model = 512
        else:
            raise ValueError(f"No configuration called '{args.config}'.")

        self.data = GlueDatasetLoader(
            tokenizer=self.tokenizer,
            max_length=self.max_seq_len,
            batch_size=self.batch_size,
            dataset_names=self.train_datasets + self.validation_datasets,
        )

        tan = TAN(
            vocab_size=self.vocab_size,
            d_model=self.d_model,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            max_seq_len=self.max_seq_len,
        )

        if args.from_pretrained:
            weight_path = os.path.join("pretrained_weights", "tan_" + args.config + ".pt")
            state_dict = torch.load(weight_path, map_location=self.device)
            tan.load_state_dict(state_dict)

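        # Wrap the backbone in the task head; in benchmark mode Embedder is
        # GLUEWrapper and receives num_outputs and num_sentences.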
        self.tan_embedder = Embedder(tan, num_outputs, num_sentences)
        self.tan_optimizer = AdamW(self.tan_embedder.parameters(), lr=self.learning_rate)
        print(f"\nNum of params TAN: {sum(p.numel() for p in tan.parameters() if p.requires_grad)}")

        if self.include_baseline:
            transformer = Transformer(
                vocab_size=self.vocab_size,
                d_model=self.d_model,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
                max_position_embeddings=self.max_seq_len,
                hidden_dropout_prob=self.hidden_dropout_prob,
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                intermediate_size=512,
            )

            if args.from_pretrained:
                weight_path = os.path.join("pretrained_weights", "transformer_" + args.config + ".pt")
                state_dict = torch.load(weight_path, map_location=self.device)
                transformer.load_state_dict(state_dict)

            self.transformer_embedder = Embedder(transformer, num_outputs, num_sentences)
            self.transformer_optimizer = AdamW(
                self.transformer_embedder.parameters(), lr=self.learning_rate
            )
            print(
                f"Num of params in Transformer: {sum(p.numel() for p in transformer.parameters() if p.requires_grad)}"
            )


def run(experiment, suffix=None):
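    """Train the TAN embedder and, if enabled, the baseline Transformer,
    timing each run and naming it ``tan_<suffix>`` / ``transformer_<suffix>``."""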
    if suffix is None:
        suffix = f"{experiment.num_hidden_layers}_{experiment.d_model}"

    if experiment.include_baseline:
        print("\nTraining Transformer\n")
        name = f"transformer_{suffix}"
        with timer(f"Transformer training ({name})"):
            transformer_embedder = train(
                experiment.transformer_embedder,
                experiment.transformer_optimizer,
                experiment,
                name,
            )

    print("\nTraining TAN\n")
    name = f"tan_{suffix}"
    with timer(f"TAN training ({name})"):
        tan_embedder = train(
            experiment.tan_embedder, experiment.tan_optimizer, experiment, name
        )

def run_benchmark(args):
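    """Fine-tune each model configuration on every benchmark dataset."""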
    # Maps dataset name -> [num_outputs, num_sentences].
    benchmark_datasets = {
        "stsb": [0, 2],
        "rte": [2, 2], "mrpc": [2, 2], "qqp": [2, 2], "qnli": [2, 2],  # "wnli": [2, 2],
        "mnli": [3, 2],  # "ax": [3, 2],
        "cola": [2, 1], "sst2": [2, 1],

        "boolq": [2, 2], "axb": [2, 2], "axg": [2, 2],
        "cb": [3, 2],
        "copa": [2, 3],
    }

    configs = ["atomic", "nano", "micro", "milli"]

    for config in configs:
        args.config = config  # Experiment reads the model size from args.config.
        for dataset, info in benchmark_datasets.items():
            num_outputs, num_sentences = info

            experiment = Experiment(
                args,
                num_epochs=args.num_epochs,
                batch_size=64,
                learning_rate=2e-5,
                warmup_steps=args.warmup_steps,
                train_datasets=[dataset],
                validation_datasets=[dataset],
                include_baseline=args.include_baseline,
                dropout_prob=args.dropout_prob,
                max_seq_len=args.max_seq_len,
                vocab_size=args.vocab_size,
                num_outputs=num_outputs,
                num_sentences=num_sentences,
            )

            # Include the dataset in the suffix so run names do not collide.
            run(experiment, f"{config}_{dataset}")

def main():
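    """Parse command-line arguments and launch either the GLUE benchmark
    or a grid of training runs over num_hidden_layers and d_model."""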
    parser = argparse.ArgumentParser(
        description="Train models with various parameters."
    )

    parser.add_argument(
        "--num-hidden-layers",
        nargs="+",
        type=int,
        default=[1],
        help="List of num_hidden_layers to try.",
    )
    parser.add_argument(
        "--d-model",
        nargs="+",
        type=int,
        default=[128],
        help="List of d_model dimensions to try.",
    )
    parser.add_argument("--num-epochs", type=int, default=5, help="Number of epochs.")
    parser.add_argument("--batch-size", type=int, default=256, help="Batch size.")
    parser.add_argument(
        "--learning-rate", type=float, default=2e-5, help="Learning rate."
    )
    parser.add_argument(
        "--warmup-steps", type=int, default=1000, help="Number of warmup steps."
    )
    parser.add_argument(
        "--train-datasets",
        nargs="+",
        default=["snli", "mnli"],
        help="List of training datasets.",
    )
    parser.add_argument(
        "--config",
        default="atomic",
        choices=["atomic", "nano", "micro", "milli"],
        help="Model size configuration: atomic, nano, micro, or milli.",
    )
    parser.add_argument(
        "--validation-datasets",
        nargs="+",
        default=["stsb"],
        help="List of validation datasets.",
    )
    parser.add_argument(
        "--include-baseline",
        action="store_true",
        help="Whether to train the baseline Transformer.",
    )
    parser.add_argument(
        "--from-pretrained",
        action="store_true",
        help="Whether to load pretrained configuration",
    )
    parser.add_argument(
        "--benchmark",
        action="store_true",
        help="if to benchmark on glue",
    )
    parser.add_argument(
        "--dropout-prob", type=float, default=0.2, help="Dropout probability."
    )
    parser.add_argument(
        "--max-seq-len", type=int, default=128, help="Maximum sequence length."
    )
    parser.add_argument(
        "--vocab-size", type=int, default=30552, help="Vocabulary size."
    )

    args = parser.parse_args()

    if args.benchmark:
        # Experiment already uses the module-level GLUEWrapper as Embedder.
        run_benchmark(args)
        return

    # Experiment refers to the module-level Embedder, so rebind it globally
    # before building non-benchmark experiments.
    global Embedder
    from src.embedder import Embedder
    for n in args.num_hidden_layers:
        for dim in args.d_model:
            experiment = Experiment(
                args,
                num_hidden_layers=n,
                num_attention_heads=n,
                d_model=dim,
                num_epochs=args.num_epochs,
                batch_size=args.batch_size,
                learning_rate=args.learning_rate,
                warmup_steps=args.warmup_steps,
                train_datasets=args.train_datasets,
                validation_datasets=args.validation_datasets,
                include_baseline=args.include_baseline,
                dropout_prob=args.dropout_prob,
                max_seq_len=args.max_seq_len,
                vocab_size=args.vocab_size,
            )

            run(experiment)


if __name__ == "__main__":
    main()
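    # Launch TensorBoard on the training logs (blocks until the process exits).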
    os.system("tensorboard --logdir=runs")
