[calligra] /: Optimized vector composite ops by a further 1.5-2 times
Dmitry Kazakov
dimula73 at gmail.com
Sun Dec 2 15:45:56 UTC 2012
Git commit d6403916cb2973909ef234acda76992e1a31f787 by Dmitry Kazakov.
Committed on 02/12/2012 at 06:37.
Pushed by dkazakov into branch 'master'.
Optimized vector composite ops by a further 1.5-2 times
Uint<->Float conversion is quite expensive compared to
Int<->Float (2-2.5 times slower). This happens because of the special
code that handles the sign bit of the number. Discarding this bit by
converting Uint->Int first therefore gives a huge speedup.
Now the vector version of the composition is 1.8-8.7 times faster
than the old version (weighted average: 3.2 times).
Many thanks to Matthias Kretz for pointing this out!
CCMAIL:kimageshop at kde.org
CCMAIL:kretz at kde.org
M +92 -0 krita/benchmarks/kis_composition_benchmark.cpp
M +5 -0 krita/benchmarks/kis_composition_benchmark.h
M +9 -9 libs/pigment/compositeops/KoStreamedMath.h
http://commits.kde.org/calligra/d6403916cb2973909ef234acda76992e1a31f787
diff --git a/krita/benchmarks/kis_composition_benchmark.cpp b/krita/benchmarks/kis_composition_benchmark.cpp
index b6b4a6b..9e3ef08 100644
--- a/krita/benchmarks/kis_composition_benchmark.cpp
+++ b/krita/benchmarks/kis_composition_benchmark.cpp
@@ -516,5 +516,97 @@ void KisCompositionBenchmark::benchmarkMemcpy()
freeTiles(tiles, 0, 0);
}
+void KisCompositionBenchmark::benchmarkUintFloat()
+{
+#ifdef HAVE_VC
+ const int vecSize = Vc::float_v::Size;
+
+ const int dataSize = 4096;
+ quint8 *iData = (quint8*) memalign(vecSize, dataSize);
+ float *fData = (float*) memalign(vecSize * 4, dataSize * 4);
+
+ QBENCHMARK {
+ for (int i = 0; i < dataSize; i += Vc::float_v::Size) {
+ // convert uint -> float directly, this causes
+ // static_cast helper be called
+ Vc::float_v b(Vc::uint_v(iData + i));
+ b.store(fData + i);
+ }
+ }
+
+ free(iData);
+ free(fData);
+#endif
+}
+
+void KisCompositionBenchmark::benchmarkUintIntFloat()
+{
+#ifdef HAVE_VC
+ const int vecSize = Vc::float_v::Size;
+
+ const int dataSize = 4096;
+ quint8 *iData = (quint8*) memalign(vecSize, dataSize);
+ float *fData = (float*) memalign(vecSize * 4, dataSize * 4);
+
+ QBENCHMARK {
+ for (int i = 0; i < dataSize; i += Vc::float_v::Size) {
+ // convert uint->int->float, that avoids special sign
+ // treating, and gives 2.6 times speedup
+ Vc::float_v b(Vc::int_v(Vc::uint_v(iData + i)));
+ b.store(fData + i);
+ }
+ }
+
+ free(iData);
+ free(fData);
+#endif
+}
+
+void KisCompositionBenchmark::benchmarkFloatUint()
+{
+#ifdef HAVE_VC
+ const int vecSize = Vc::float_v::Size;
+
+ const int dataSize = 4096;
+ quint32 *iData = (quint32*) memalign(vecSize * 4, dataSize * 4);
+ float *fData = (float*) memalign(vecSize * 4, dataSize * 4);
+
+ QBENCHMARK {
+ for (int i = 0; i < dataSize; i += Vc::float_v::Size) {
+ // conversion float -> uint
+ Vc::uint_v b(Vc::float_v(fData + i));
+
+ b.store(iData + i);
+ }
+ }
+
+ free(iData);
+ free(fData);
+#endif
+}
+
+void KisCompositionBenchmark::benchmarkFloatIntUint()
+{
+#ifdef HAVE_VC
+ const int vecSize = Vc::float_v::Size;
+
+ const int dataSize = 4096;
+ quint32 *iData = (quint32*) memalign(vecSize * 4, dataSize * 4);
+ float *fData = (float*) memalign(vecSize * 4, dataSize * 4);
+
+ QBENCHMARK {
+ for (int i = 0; i < dataSize; i += Vc::float_v::Size) {
+ // conversion float -> int -> uint
+ Vc::uint_v b(Vc::int_v(Vc::float_v(fData + i)));
+
+ b.store(iData + i);
+ }
+ }
+
+ free(iData);
+ free(fData);
+#endif
+}
+
QTEST_KDEMAIN(KisCompositionBenchmark, GUI)
diff --git a/krita/benchmarks/kis_composition_benchmark.h b/krita/benchmarks/kis_composition_benchmark.h
index ea5603d..9dabf65 100644
--- a/krita/benchmarks/kis_composition_benchmark.h
+++ b/krita/benchmarks/kis_composition_benchmark.h
@@ -43,6 +43,11 @@ private slots:
void testRgb8CompositeOverReal_Aligned();
void benchmarkMemcpy();
+
+ void benchmarkUintFloat();
+ void benchmarkUintIntFloat();
+ void benchmarkFloatUint();
+ void benchmarkFloatIntUint();
};
#endif /* __KIS_COMPOSITION_BENCHMARK_H */
diff --git a/libs/pigment/compositeops/KoStreamedMath.h b/libs/pigment/compositeops/KoStreamedMath.h
index 8a15a0a..9d2dfa8 100644
--- a/libs/pigment/compositeops/KoStreamedMath.h
+++ b/libs/pigment/compositeops/KoStreamedMath.h
@@ -94,7 +94,7 @@ static inline quint8 lerp_mixed_u8_float(quint8 a, quint8 b, float alpha) {
*/
inline Vc::float_v fetch_mask_8(const quint8 *data) {
Vc::uint_v data_i(data);
- return Vc::float_v(data_i);
+ return Vc::float_v(Vc::int_v(data_i));
}
/**
@@ -118,7 +118,7 @@ inline Vc::float_v fetch_alpha_32(const quint8 *data) {
data_i.load((const quint32*)data, Vc::Unaligned);
}
- return Vc::float_v(data_i >> 24);
+ return Vc::float_v(Vc::int_v(data_i >> 24));
}
/**
@@ -148,9 +148,9 @@ inline void fetch_colors_32(const quint8 *data,
const quint32 lowByteMask = 0xFF;
Vc::uint_v mask(lowByteMask);
- c1 = Vc::float_v((data_i >> 16) & mask);
- c2 = Vc::float_v((data_i >> 8) & mask);
- c3 = Vc::float_v( data_i & mask);
+ c1 = Vc::float_v(Vc::int_v((data_i >> 16) & mask));
+ c2 = Vc::float_v(Vc::int_v((data_i >> 8) & mask));
+ c3 = Vc::float_v(Vc::int_v( data_i & mask));
}
/**
@@ -175,11 +175,11 @@ inline void write_channels_32(quint8 *data,
const quint32 lowByteMask = 0xFF;
Vc::uint_v mask(lowByteMask);
- Vc::uint_v v1 = Vc::uint_v(alpha) << 24;
- Vc::uint_v v2 = (Vc::uint_v(c1) & mask) << 16;
- Vc::uint_v v3 = (Vc::uint_v(c2) & mask) << 8;
+ Vc::uint_v v1 = Vc::uint_v(Vc::int_v(alpha)) << 24;
+ Vc::uint_v v2 = (Vc::uint_v(Vc::int_v(c1)) & mask) << 16;
+ Vc::uint_v v3 = (Vc::uint_v(Vc::int_v(c2)) & mask) << 8;
v1 = v1 | v2;
- Vc::uint_v v4 = Vc::uint_v(c3) & mask;
+ Vc::uint_v v4 = Vc::uint_v(Vc::int_v(c3)) & mask;
v3 = v3 | v4;
*((Vc::uint_v*)data) = v1 | v3;
More information about the kimageshop
mailing list