[calligra/vector_compositioning_kazakov] /: Added the first version of per-architecture binaries for composition
Dmitry Kazakov
dimula73 at gmail.com
Sun Dec 2 15:46:14 UTC 2012
Git commit 7cf94c1af73cb7bf56365f27a5b77df32d862778 by Dmitry Kazakov.
Committed on 02/12/2012 at 16:44.
Pushed by dkazakov into branch 'vector_compositioning_kazakov'.
Added the first version of per-architecture binaries for composition
Pros:
+ we can have prebuilt versions for all the architectures supported
by Vc (AMD FMA4 and XOP are not supported by Vc yet)
+ the implementation is chosen dynamically on Krita start
+ the semi-general code for multi-arch builds is now in
KoVcMultiArchBuildSupport.h (might be ported upstream in the future)
Cons:
- it depends on Vc's 'staging' branch, so it can't be put in master
right now
- the code became much less readable due to all that template magic
- I had to copy-paste Vc's 'vc_compile_for_all_implementations' cmake
macro, because we do not need 'Scalar' implementation
- the size of the pigment library grew almost 1.5 times: 11->17 MiB
(probably, we still need plugin system for this)
CCMAIL:kimageshop at kde.org
CCMAIL:kretz at kde.org
M +2 -2 CMakeLists.txt
M +2 -2 krita/benchmarks/kis_composition_benchmark.cpp
M +55 -3 libs/pigment/CMakeLists.txt
M +15 -19 libs/pigment/compositeops/KoOptimizedCompositeOpAlphaDarken32.h
M +29 -15 libs/pigment/compositeops/KoOptimizedCompositeOpFactory.cpp
A +101 -0 libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.cpp [License: GPL (v2+)]
A +58 -0 libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.h [License: GPL (v2+)]
M +19 -23 libs/pigment/compositeops/KoOptimizedCompositeOpOver32.h
M +17 -35 libs/pigment/compositeops/KoStreamedMath.h
A +74 -0 libs/pigment/compositeops/KoVcMultiArchBuildSupport.h [License: GPL (v2+)]
http://commits.kde.org/calligra/7cf94c1af73cb7bf56365f27a5b77df32d862778
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a439203..d2b489c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -324,8 +324,8 @@ if(HAVE_VC)
message(STATUS "Vc found!")
SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Vc_CMAKE_MODULES_DIR}")
- include (OptimizeForArchitecture)
- OptimizeForArchitecture()
+# include (OptimizeForArchitecture)
+# OptimizeForArchitecture()
endif(HAVE_VC)
set(CMAKE_MODULE_PATH ${OLD_CMAKE_MODULE_PATH} )
diff --git a/krita/benchmarks/kis_composition_benchmark.cpp b/krita/benchmarks/kis_composition_benchmark.cpp
index 9e3ef08..33b6427 100644
--- a/krita/benchmarks/kis_composition_benchmark.cpp
+++ b/krita/benchmarks/kis_composition_benchmark.cpp
@@ -363,10 +363,10 @@ void checkRounding()
quint8 *msk2 = tiles[1].mask;
for (int i = 0; i < numBlocks; i++) {
- Compositor::template compositeVector<true,true>(src1, dst1, msk1, 0.5, 0.3);
+ Compositor::template compositeVector<true,true, VC_IMPL>(src1, dst1, msk1, 0.5, 0.3);
for (int j = 0; j < vecSize; j++) {
- Compositor::template compositeOnePixelScalar<true>(src2, dst2, msk2, 0.5, 0.3, QBitArray());
+ Compositor::template compositeOnePixelScalar<true, VC_IMPL>(src2, dst2, msk2, 0.5, 0.3, QBitArray());
if(!comparePixels(dst1, dst2, 0)) {
qDebug() << "Wrong rounding in pixel:" << 8 * i + j;
diff --git a/libs/pigment/CMakeLists.txt b/libs/pigment/CMakeLists.txt
index 2eb7019f..68e7f5e 100644
--- a/libs/pigment/CMakeLists.txt
+++ b/libs/pigment/CMakeLists.txt
@@ -12,9 +12,60 @@ endif(OPENEXR_FOUND)
set(LINK_VC_LIB)
if(HAVE_SANE_VC)
- include_directories(${Vc_INCLUDE_DIR})
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${Vc_DEFINITIONS}")
- set(LINK_VC_LIB ${Vc_LIBRARIES})
+ include_directories(${Vc_INCLUDE_DIR})
+
+ set(OLD_CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} )
+ SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Vc_CMAKE_MODULES_DIR}")
+ include (VcMacros)
+
+# ca_compile_for_all_implementations(<objs_var> <src> [<extra flags>...])
+#
+# This is a copy-paste of vc_compile_for_all_implementations() from
+# VcMacros.cmake; we need a version *without* the Scalar implementation.
+#
+#   _objs - name of the variable that is cleared here and then handed to
+#           every _vc_compile_one_implementation() call
+#   _src  - source file, stored in _vc_compile_src for the helper
+#   ARGN  - extra compiler flags appended to the common flag list
+#
+# NOTE(review): _vc_compile_one_implementation() is presumably provided by
+# the VcMacros module included above and reads _flags / _vc_compile_src
+# from the calling scope -- confirm against the Vc version in use.
+macro(ca_compile_for_all_implementations _objs _src)
+    set(${_objs})
+
+    # remove all -march, -msse, etc. flags from the flags we want to pass
+    string(REPLACE "${Vc_ARCHITECTURE_FLAGS}" "" _flags "${Vc_DEFINITIONS}")
+    # The pattern below is a regular expression, so REGEX REPLACE is needed;
+    # plain REPLACE would only look for the literal text "[^ ]*".
+    string(REGEX REPLACE "-DVC_IMPL=[^ ]*" "" _flags "${_flags}")
+
+    # capture the -march= switch as -mtune; if there is none skip it
+    if(Vc_ARCHITECTURE_FLAGS MATCHES "-march=")
+        string(REGEX REPLACE "^.*-march=([^ ]*).*$" "-mtune=\\1" _tmp "${Vc_ARCHITECTURE_FLAGS}")
+        set(_flags "${_flags} ${_tmp}")
+    endif()
+
+    # make a semicolon separated list of all flags
+    string(TOUPPER "${CMAKE_BUILD_TYPE}" _tmp)
+    set(_tmp "CMAKE_CXX_FLAGS_${_tmp}")
+    string(REPLACE " " ";" _flags "${CMAKE_CXX_FLAGS} ${${_tmp}} ${_flags} ${ARGN}")
+    get_directory_property(_inc INCLUDE_DIRECTORIES)
+    foreach(_i ${_inc})
+        list(APPEND _flags "-I${_i}")
+    endforeach()
+
+    set(_vc_compile_src "${_src}")
+
+# commented out intentionally
+#    _vc_compile_one_implementation(${_objs} Scalar NO_FLAG)
+    if(NOT Vc_SSE_INTRINSICS_BROKEN)
+        _vc_compile_one_implementation(${_objs} SSE2   "-msse2"   "-xSSE2"   "/arch:SSE2")
+        _vc_compile_one_implementation(${_objs} SSE3   "-msse3"   "-xSSE3"   "/arch:SSE2")
+        _vc_compile_one_implementation(${_objs} SSSE3  "-mssse3"  "-xSSSE3"  "/arch:SSE2")
+        _vc_compile_one_implementation(${_objs} SSE4_1 "-msse4.1" "-xSSE4.1" "/arch:SSE2")
+        _vc_compile_one_implementation(${_objs} SSE4_2 "-msse4.2" "-xSSE4.2" "/arch:SSE2")
+        _vc_compile_one_implementation(${_objs} SSE4a  "-msse4a"  "-xSSSE3"  "/arch:SSE2")
+    endif()
+    if(NOT Vc_AVX_INTRINSICS_BROKEN)
+        _vc_compile_one_implementation(${_objs} AVX "-mavx" "-xAVX" "/arch:AVX")
+    endif()
+endmacro()
+
+    # Build KoOptimizedCompositeOpFactoryPerArch.cpp once per Vc implementation;
+    # the resulting list is collected in __per_arch_factory_objs.
+    ca_compile_for_all_implementations(__per_arch_factory_objs compositeops/KoOptimizedCompositeOpFactoryPerArch.cpp "-fPIC")
+    message("Following objects are generated from the per-arch lib")
+    # Quote the list: unquoted, an empty list leaves message() without any
+    # argument (a configure error) and a non-empty one is printed with its
+    # elements run together.
+    message("${__per_arch_factory_objs}")
+
+ set(CMAKE_MODULE_PATH ${OLD_CMAKE_MODULE_PATH} )
+
+ set(LINK_VC_LIB ${Vc_LIBRARIES})
endif(HAVE_SANE_VC)
add_subdirectory(tests)
@@ -53,6 +104,7 @@ set(pigmentcms_SRCS
colorspaces/KoSimpleColorSpaceEngine.cpp
compositeops/KoOptimizedCompositeOpFactory.cpp
compositeops/KoOptimizedCompositeOpFactory_p.cpp
+ ${__per_arch_factory_objs}
colorprofiles/KoDummyColorProfile.cpp
resources/KoAbstractGradient.cpp
resources/KoColorSet.cpp
diff --git a/libs/pigment/compositeops/KoOptimizedCompositeOpAlphaDarken32.h b/libs/pigment/compositeops/KoOptimizedCompositeOpAlphaDarken32.h
index 57f03ef..b00a6db 100644
--- a/libs/pigment/compositeops/KoOptimizedCompositeOpAlphaDarken32.h
+++ b/libs/pigment/compositeops/KoOptimizedCompositeOpAlphaDarken32.h
@@ -21,7 +21,6 @@
#ifndef KOOPTIMIZEDCOMPOSITEOPALPHADARKEN32_H_
#define KOOPTIMIZEDCOMPOSITEOPALPHADARKEN32_H_
-#include "KoCompositeOpFunctions.h"
#include "KoCompositeOpBase.h"
#include "KoStreamedMath.h"
@@ -42,9 +41,7 @@ struct AlphaDarkenCompositor32 {
* o This function is *never* used if HAVE_SANE_VC is not present
*/
-#ifdef HAVE_SANE_VC
-
- template<bool haveMask, bool src_aligned>
+ template<bool haveMask, bool src_aligned, Vc::Implementation _impl>
static ALWAYS_INLINE void compositeVector(const quint8 *src, quint8 *dst, const quint8 *mask, float opacity, float flow)
{
Vc::float_v src_alpha;
@@ -61,16 +58,16 @@ struct AlphaDarkenCompositor32 {
Vc::float_v msk_norm_alpha;
- src_alpha = KoStreamedMath::fetch_alpha_32<src_aligned>(src);
+ src_alpha = KoStreamedMath<_impl>::template fetch_alpha_32<src_aligned>(src);
if (haveMask) {
- Vc::float_v mask_vec = KoStreamedMath::fetch_mask_8(mask);
+ Vc::float_v mask_vec = KoStreamedMath<_impl>::fetch_mask_8(mask);
msk_norm_alpha = src_alpha * mask_vec * uint8MaxRec2;
} else {
msk_norm_alpha = src_alpha * uint8MaxRec1;
}
- dst_alpha = KoStreamedMath::fetch_alpha_32<true>(dst);
+ dst_alpha = KoStreamedMath<_impl>::template fetch_alpha_32<true>(dst);
src_alpha = msk_norm_alpha * opacity_vec;
Vc::float_m empty_dst_pixels_mask = dst_alpha == zeroValue;
@@ -83,7 +80,7 @@ struct AlphaDarkenCompositor32 {
Vc::float_v dst_c2;
Vc::float_v dst_c3;
- KoStreamedMath::fetch_colors_32<src_aligned>(src, src_c1, src_c2, src_c3);
+ KoStreamedMath<_impl>::template fetch_colors_32<src_aligned>(src, src_c1, src_c2, src_c3);
bool srcAlphaIsZero = (src_alpha == zeroValue).isFull();
if (srcAlphaIsZero) return;
@@ -109,12 +106,12 @@ struct AlphaDarkenCompositor32 {
dst_c3 = src_c3;
}
} else if (empty_dst_pixels_mask.isEmpty()) {
- KoStreamedMath::fetch_colors_32<true>(dst, dst_c1, dst_c2, dst_c3);
+ KoStreamedMath<_impl>::template fetch_colors_32<true>(dst, dst_c1, dst_c2, dst_c3);
dst_c1 = dst_blend * (src_c1 - dst_c1) + dst_c1;
dst_c2 = dst_blend * (src_c2 - dst_c2) + dst_c2;
dst_c3 = dst_blend * (src_c3 - dst_c3) + dst_c3;
} else {
- KoStreamedMath::fetch_colors_32<true>(dst, dst_c1, dst_c2, dst_c3);
+ KoStreamedMath<_impl>::template fetch_colors_32<true>(dst, dst_c1, dst_c2, dst_c3);
dst_c1(empty_dst_pixels_mask) = src_c1;
dst_c2(empty_dst_pixels_mask) = src_c2;
dst_c3(empty_dst_pixels_mask) = src_c3;
@@ -136,15 +133,13 @@ struct AlphaDarkenCompositor32 {
alpha2(alpha2_mask) = opt1;
dst_alpha = (alpha2 - alpha1) * flow_norm_vec + alpha1;
- KoStreamedMath::write_channels_32(dst, dst_alpha, dst_c1, dst_c2, dst_c3);
+ KoStreamedMath<_impl>::write_channels_32(dst, dst_alpha, dst_c1, dst_c2, dst_c3);
}
-#endif /* HAVE_SANE_VC */
-
/**
* Composes one pixel of the source into the destination
*/
- template <bool haveMask>
+ template <bool haveMask, Vc::Implementation _impl>
static ALWAYS_INLINE void compositeOnePixelScalar(const channels_type *src, channels_type *dst, const quint8 *mask, float opacity, float flow, const QBitArray &channelFlags)
{
Q_UNUSED(channelFlags);
@@ -176,9 +171,9 @@ struct AlphaDarkenCompositor32 {
}
if (dstAlphaInt != 0) {
- dst[0] = KoStreamedMath::lerp_mixed_u8_float(dst[0], src[0], srcAlphaNorm);
- dst[1] = KoStreamedMath::lerp_mixed_u8_float(dst[1], src[1], srcAlphaNorm);
- dst[2] = KoStreamedMath::lerp_mixed_u8_float(dst[2], src[2], srcAlphaNorm);
+ dst[0] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[0], src[0], srcAlphaNorm);
+ dst[1] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[1], src[1], srcAlphaNorm);
+ dst[2] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[2], src[2], srcAlphaNorm);
} else {
const pixel_type *s = reinterpret_cast<const pixel_type*>(src);
pixel_type *d = reinterpret_cast<pixel_type*>(dst);
@@ -196,6 +191,7 @@ struct AlphaDarkenCompositor32 {
* colorspaces with alpha channel placed at the last byte of
* the pixel: C1_C2_C3_A.
*/
+template<Vc::Implementation _impl>
class KoOptimizedCompositeOpAlphaDarken32 : public KoCompositeOp
{
public:
@@ -207,9 +203,9 @@ public:
virtual void composite(const KoCompositeOp::ParameterInfo& params) const
{
if(params.maskRowStart) {
- KoStreamedMath::genericComposite32<true, true, AlphaDarkenCompositor32<quint8, quint32> >(params);
+ KoStreamedMath<_impl>::template genericComposite32<true, true, AlphaDarkenCompositor32<quint8, quint32> >(params);
} else {
- KoStreamedMath::genericComposite32<false, true, AlphaDarkenCompositor32<quint8, quint32> >(params);
+ KoStreamedMath<_impl>::template genericComposite32<false, true, AlphaDarkenCompositor32<quint8, quint32> >(params);
}
}
};
diff --git a/libs/pigment/compositeops/KoOptimizedCompositeOpFactory.cpp b/libs/pigment/compositeops/KoOptimizedCompositeOpFactory.cpp
index 8982378..0f37ccc 100644
--- a/libs/pigment/compositeops/KoOptimizedCompositeOpFactory.cpp
+++ b/libs/pigment/compositeops/KoOptimizedCompositeOpFactory.cpp
@@ -19,39 +19,53 @@
#include "KoOptimizedCompositeOpFactory.h"
#include "KoOptimizedCompositeOpFactory_p.h"
-/**
- * We include these headers even when no vectorization
- * is available on the system to ensure they build correctly
- */
-#include "KoOptimizedCompositeOpAlphaDarken32.h"
-#include "KoOptimizedCompositeOpOver32.h"
-
-
#include "config-vc.h"
-
#ifdef HAVE_SANE_VC
#include <Vc/global.h>
#include <Vc/common/support.h>
-#endif
+#include "KoOptimizedCompositeOpFactoryPerArch.h"
+
+/**
+ * File-scope object whose constructor asks for the per-architecture
+ * factory and, if one is returned, lets it print its architecture
+ * info before disposing of it again.
+ */
+static struct ArchReporter {
+    ArchReporter() {
+        KoOptimizedCompositeOpFactoryPerArchBase *archFactory =
+            createOptimizedCompositeOpFactory();
+
+        // no vectorized implementation available: nothing to report
+        if (!archFactory) {
+            return;
+        }
+
+        archFactory->printArchInfo();
+        delete archFactory;
+    }
+} StaticReporter;
+
+
+#endif
KoCompositeOp* KoOptimizedCompositeOpFactory::createAlphaDarkenOp32(const KoColorSpace *cs)
{
#if defined HAVE_SANE_VC
- if (Vc::currentImplementationSupported()) {
- return new KoOptimizedCompositeOpAlphaDarken32(cs);
+ KoOptimizedCompositeOpFactoryPerArchBase *factory =
+ createOptimizedCompositeOpFactory();
+ if (factory) {
+ KoCompositeOp *op = factory->createAlphaDarkenOp32(cs);
+ delete factory;
+ return op;
}
#endif
+
return KoOptimizedCompositeOpFactoryPrivate::createLegacyAlphaDarkenOp32(cs);
}
KoCompositeOp* KoOptimizedCompositeOpFactory::createOverOp32(const KoColorSpace *cs)
{
#if defined HAVE_SANE_VC
- if (Vc::currentImplementationSupported()) {
- return new KoOptimizedCompositeOpOver32(cs);
+ KoOptimizedCompositeOpFactoryPerArchBase *factory =
+ createOptimizedCompositeOpFactory();
+ if (factory) {
+ KoCompositeOp *op = factory->createOverOp32(cs);
+ delete factory;
+ return op;
}
#endif
- return KoOptimizedCompositeOpFactoryPrivate::createLegacyOverOp32(cs);
+ return KoOptimizedCompositeOpFactoryPrivate::createLegacyOverOp32(cs);
}
diff --git a/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.cpp b/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.cpp
new file mode 100644
index 0000000..43ba698
--- /dev/null
+++ b/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2012 Dmitry Kazakov <dimula73 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "KoOptimizedCompositeOpFactoryPerArch.h"
+
+#include <QDebug>
+
+#include "KoOptimizedCompositeOpAlphaDarken32.h"
+#include "KoOptimizedCompositeOpOver32.h"
+
+/**
+ * Specialization for the implementation named by VC_IMPL in this
+ * translation unit. Returns a newly allocated
+ * KoOptimizedCompositeOpAlphaDarken32<VC_IMPL> constructed with \p cs.
+ */
+template<>
+KoCompositeOp* KoOptimizedCompositeOpFactoryPerArch<VC_IMPL>::createAlphaDarkenOp32(const KoColorSpace *cs)
+{
+ return new KoOptimizedCompositeOpAlphaDarken32<VC_IMPL>(cs);
+}
+
+/**
+ * Specialization for the implementation named by VC_IMPL in this
+ * translation unit. Returns a newly allocated
+ * KoOptimizedCompositeOpOver32<VC_IMPL> constructed with \p cs.
+ */
+template<>
+KoCompositeOp* KoOptimizedCompositeOpFactoryPerArch<VC_IMPL>::createOverOp32(const KoColorSpace *cs)
+{
+ return new KoOptimizedCompositeOpOver32<VC_IMPL>(cs);
+}
+
+// Two-level stringification: the outer macro lets its argument be
+// macro-expanded before the helper turns it into a string literal, so
+// stringify(VC_IMPL) yields the text VC_IMPL is defined as.
+// The helper avoids a double-underscore name, which is reserved for the
+// compiler and standard library.
+#define stringify_helper(_s) #_s
+#define stringify(_s) stringify_helper(_s)
+
+// Map the compiler's predefined instruction-set macros to HAVE_* constants
+// that are always defined (1 or 0), so they can be passed as plain values
+// to printFeatureSupported().
+
+#ifdef __SSE2__
+# define HAVE_SSE2 1
+#else
+# define HAVE_SSE2 0
+#endif
+
+#ifdef __SSE3__
+# define HAVE_SSE3 1
+#else
+# define HAVE_SSE3 0
+#endif
+
+#ifdef __SSSE3__
+# define HAVE_SSSE3 1
+#else
+# define HAVE_SSSE3 0
+#endif
+
+#ifdef __SSE4_1__
+# define HAVE_SSE4_1 1
+#else
+# define HAVE_SSE4_1 0
+#endif
+
+#ifdef __SSE4_2__
+# define HAVE_SSE4_2 1
+#else
+# define HAVE_SSE4_2 0
+#endif
+
+// The predefined macro is spelled with a capital 'A' (__SSE4A__); testing
+// __SSE4a__ never matches, so SSE4a would always be reported as missing.
+#ifdef __SSE4A__
+# define HAVE_SSE4a 1
+#else
+# define HAVE_SSE4a 0
+#endif
+
+#ifdef __AVX__
+# define HAVE_AVX 1
+#else
+# define HAVE_AVX 0
+#endif
+
+/**
+ * Prints one row of the feature table: a tab, the feature name, a
+ * separator and "yes" or "no" depending on \p present.
+ *
+ * The function has internal linkage on purpose. This file is compiled
+ * once per instruction set and all resulting objects are linked into the
+ * same library; an externally visible inline function would be merged
+ * into one copy by the linker, and that copy could have been built for
+ * an instruction set the running CPU does not support.
+ */
+static inline void printFeatureSupported(const QString &feature,
+                                         bool present)
+{
+    qDebug() << "\t" << feature << "\t---\t" << (present ? "yes" : "no");
+}
+
+/**
+ * Writes to the debug output the stringified VC_IMPL this translation
+ * unit was built with, followed by one line per instruction-set
+ * feature telling whether the compiler had it enabled.
+ */
+template<>
+void KoOptimizedCompositeOpFactoryPerArch<VC_IMPL>::printArchInfo()
+{
+    struct FeatureRow {
+        const char *label;
+        bool compiledIn;
+    };
+
+    // rows are printed in exactly this order
+    static const FeatureRow rows[] = {
+        { "SSE2",   HAVE_SSE2 != 0 },
+        { "SSE3",   HAVE_SSE3 != 0 },
+        { "SSSE3",  HAVE_SSSE3 != 0 },
+        { "SSE4.1", HAVE_SSE4_1 != 0 },
+        { "SSE4.2", HAVE_SSE4_2 != 0 },
+        { "SSE4a",  HAVE_SSE4a != 0 },
+        { "AVX ",   HAVE_AVX != 0 }
+    };
+    const int rowCount = int(sizeof(rows) / sizeof(rows[0]));
+
+    qDebug() << "Compiled for arch:" << stringify(VC_IMPL);
+    qDebug() << "Features supported:";
+
+    for (int row = 0; row < rowCount; ++row) {
+        printFeatureSupported(rows[row].label, rows[row].compiledIn);
+    }
+}
diff --git a/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.h b/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.h
new file mode 100644
index 0000000..3418a49
--- /dev/null
+++ b/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2012 Dmitry Kazakov <dimula73 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef KOOPTIMIZEDCOMPOSITEOPFACTORYPERARCH_H
+#define KOOPTIMIZEDCOMPOSITEOPFACTORYPERARCH_H
+
+#include "config-vc.h"
+#ifndef HAVE_SANE_VC
+#error "BUG: There is no reason in including this file when Vc is not present"
+#endif
+
+#include "KoVcMultiArchBuildSupport.h"
+
+
+class KoCompositeOp;
+class KoColorSpace;
+
+/**
+ * Architecture-independent interface of the per-architecture composite
+ * op factories. Callers hold a pointer to this base type, so the
+ * destructor is virtual.
+ */
+struct KoOptimizedCompositeOpFactoryPerArchBase
+{
+ virtual ~KoOptimizedCompositeOpFactoryPerArchBase() {}
+ /// Creates the alpha-darken composite op for colorspace \p cs
+ virtual KoCompositeOp* createAlphaDarkenOp32(const KoColorSpace *cs) = 0;
+ /// Creates the over composite op for colorspace \p cs
+ virtual KoCompositeOp* createOverOp32(const KoColorSpace *cs) = 0;
+ /// Prints information about the architecture the factory was built for
+ virtual void printArchInfo() = 0;
+};
+
+/**
+ * Factory parameterized by the Vc implementation \p _impl. Only the
+ * member declarations live here; this header declares explicit
+ * specializations for each architecture further below.
+ */
+template<Vc::Implementation _impl>
+struct KoOptimizedCompositeOpFactoryPerArch : public KoOptimizedCompositeOpFactoryPerArchBase
+{
+ KoCompositeOp* createAlphaDarkenOp32(const KoColorSpace *cs);
+ KoCompositeOp* createOverOp32(const KoColorSpace *cs);
+ void printArchInfo();
+};
+
+// Declares the explicit specializations of all three factory members for
+// the implementation __arch, so that code including this header refers
+// to the specializations instead of instantiating the primary template.
+#define DECLARE_FOR_ARCH(__arch) \
+ template<> KoCompositeOp* KoOptimizedCompositeOpFactoryPerArch<__arch>::createAlphaDarkenOp32(const KoColorSpace *cs); \
+ template<> KoCompositeOp* KoOptimizedCompositeOpFactoryPerArch<__arch>::createOverOp32(const KoColorSpace *cs); \
+ template<> void KoOptimizedCompositeOpFactoryPerArch<__arch>::printArchInfo();
+
+// Emit the declarations for every non-scalar implementation.
+DECLARE_FOR_ALL_ARCHS_NO_SCALAR(DECLARE_FOR_ARCH);
+// Shorthand for the runtime selector; used like a function name:
+// createOptimizedCompositeOpFactory() returns a base-class pointer
+// (0 when no listed implementation is supported).
+#define createOptimizedCompositeOpFactory createOptimizedFactoryNoScalar<KoOptimizedCompositeOpFactoryPerArch, KoOptimizedCompositeOpFactoryPerArchBase>
+
+
+#endif /* KOOPTIMIZEDCOMPOSITEOPFACTORYPERARCH_H */
diff --git a/libs/pigment/compositeops/KoOptimizedCompositeOpOver32.h b/libs/pigment/compositeops/KoOptimizedCompositeOpOver32.h
index 8d16f3e..5790cf5 100644
--- a/libs/pigment/compositeops/KoOptimizedCompositeOpOver32.h
+++ b/libs/pigment/compositeops/KoOptimizedCompositeOpOver32.h
@@ -21,7 +21,6 @@
#ifndef KOOPTIMIZEDCOMPOSITEOPOVER32_H_
#define KOOPTIMIZEDCOMPOSITEOPOVER32_H_
-#include "KoCompositeOpFunctions.h"
#include "KoCompositeOpBase.h"
#include "KoStreamedMath.h"
@@ -31,9 +30,7 @@ template<typename channels_type, typename pixel_type, bool alphaLocked, bool all
struct OverCompositor32 {
// \see docs in AlphaDarkenCompositor32
-#ifdef HAVE_SANE_VC
-
- template<bool haveMask, bool src_aligned>
+ template<bool haveMask, bool src_aligned, Vc::Implementation _impl>
static ALWAYS_INLINE void compositeVector(const quint8 *src, quint8 *dst, const quint8 *mask, float opacity, float flow)
{
Q_UNUSED(flow);
@@ -41,7 +38,7 @@ struct OverCompositor32 {
Vc::float_v src_alpha;
Vc::float_v dst_alpha;
- src_alpha = KoStreamedMath::fetch_alpha_32<src_aligned>(src);
+ src_alpha = KoStreamedMath<_impl>::template fetch_alpha_32<src_aligned>(src);
bool haveOpacity = opacity != 1.0;
Vc::float_v opacity_norm_vec(opacity);
@@ -54,7 +51,7 @@ struct OverCompositor32 {
src_alpha *= opacity_norm_vec;
if (haveMask) {
- Vc::float_v mask_vec = KoStreamedMath::fetch_mask_8(mask);
+ Vc::float_v mask_vec = KoStreamedMath<_impl>::fetch_mask_8(mask);
src_alpha *= mask_vec * uint8MaxRec1;
}
@@ -64,7 +61,7 @@ struct OverCompositor32 {
return;
}
- dst_alpha = KoStreamedMath::fetch_alpha_32<true>(dst);
+ dst_alpha = KoStreamedMath<_impl>::template fetch_alpha_32<true>(dst);
Vc::float_v src_c1;
Vc::float_v src_c2;
@@ -75,7 +72,7 @@ struct OverCompositor32 {
Vc::float_v dst_c3;
- KoStreamedMath::fetch_colors_32<src_aligned>(src, src_c1, src_c2, src_c3);
+ KoStreamedMath<_impl>::template fetch_colors_32<src_aligned>(src, src_c1, src_c2, src_c3);
Vc::float_v src_blend;
Vc::float_v new_alpha;
@@ -97,7 +94,7 @@ struct OverCompositor32 {
}
if (!(src_blend == oneValue).isFull()) {
- KoStreamedMath::fetch_colors_32<true>(dst, dst_c1, dst_c2, dst_c3);
+ KoStreamedMath<_impl>::template fetch_colors_32<true>(dst, dst_c1, dst_c2, dst_c3);
dst_c1 = src_blend * (src_c1 - dst_c1) + dst_c1;
dst_c2 = src_blend * (src_c2 - dst_c2) + dst_c2;
@@ -116,12 +113,10 @@ struct OverCompositor32 {
}
}
- KoStreamedMath::write_channels_32(dst, new_alpha, dst_c1, dst_c2, dst_c3);
+ KoStreamedMath<_impl>::write_channels_32(dst, new_alpha, dst_c1, dst_c2, dst_c3);
}
-#endif /* HAVE_SANE_VC */
-
- template <bool haveMask>
+ template <bool haveMask, Vc::Implementation _impl>
static ALWAYS_INLINE void compositeOnePixelScalar(const channels_type *src, channels_type *dst, const quint8 *mask, float opacity, float flow, const QBitArray &channelFlags)
{
Q_UNUSED(flow);
@@ -165,9 +160,9 @@ struct OverCompositor32 {
pixel_type *d = reinterpret_cast<pixel_type*>(dst);
*d = *s;
} else if (srcBlendNorm != 0.0){
- dst[0] = KoStreamedMath::lerp_mixed_u8_float(dst[0], src[0], srcBlendNorm);
- dst[1] = KoStreamedMath::lerp_mixed_u8_float(dst[1], src[1], srcBlendNorm);
- dst[2] = KoStreamedMath::lerp_mixed_u8_float(dst[2], src[2], srcBlendNorm);
+ dst[0] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[0], src[0], srcBlendNorm);
+ dst[1] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[1], src[1], srcBlendNorm);
+ dst[2] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[2], src[2], srcBlendNorm);
}
} else {
if (srcBlendNorm == 1.0) {
@@ -175,9 +170,9 @@ struct OverCompositor32 {
if(channelFlags.at(1)) dst[1] = src[1];
if(channelFlags.at(2)) dst[2] = src[2];
} else if (srcBlendNorm != 0.0) {
- if(channelFlags.at(0)) dst[0] = KoStreamedMath::lerp_mixed_u8_float(dst[0], src[0], srcBlendNorm);
- if(channelFlags.at(1)) dst[1] = KoStreamedMath::lerp_mixed_u8_float(dst[1], src[1], srcBlendNorm);
- if(channelFlags.at(2)) dst[2] = KoStreamedMath::lerp_mixed_u8_float(dst[2], src[2], srcBlendNorm);
+ if(channelFlags.at(0)) dst[0] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[0], src[0], srcBlendNorm);
+ if(channelFlags.at(1)) dst[1] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[1], src[1], srcBlendNorm);
+ if(channelFlags.at(2)) dst[2] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[2], src[2], srcBlendNorm);
}
}
@@ -193,6 +188,7 @@ struct OverCompositor32 {
* colorspaces with alpha channel placed at the last byte of
* the pixel: C1_C2_C3_A.
*/
+template<Vc::Implementation _impl>
class KoOptimizedCompositeOpOver32 : public KoCompositeOp
{
public:
@@ -215,7 +211,7 @@ public:
if (params.channelFlags.isEmpty() ||
params.channelFlags == QBitArray(4, true)) {
- KoStreamedMath::genericComposite32<haveMask, false, OverCompositor32<quint8, quint32, false, true> >(params);
+ KoStreamedMath<_impl>::template genericComposite32<haveMask, false, OverCompositor32<quint8, quint32, false, true> >(params);
} else {
const bool allChannelsFlag =
params.channelFlags.at(0) &&
@@ -226,11 +222,11 @@ public:
!params.channelFlags.at(3);
if (allChannelsFlag && alphaLocked) {
- KoStreamedMath::genericComposite32_novector<haveMask, false, OverCompositor32<quint8, quint32, true, true> >(params);
+ KoStreamedMath<_impl>::template genericComposite32_novector<haveMask, false, OverCompositor32<quint8, quint32, true, true> >(params);
} else if (!allChannelsFlag && !alphaLocked) {
- KoStreamedMath::genericComposite32_novector<haveMask, false, OverCompositor32<quint8, quint32, false, false> >(params);
+ KoStreamedMath<_impl>::template genericComposite32_novector<haveMask, false, OverCompositor32<quint8, quint32, false, false> >(params);
} else /*if (!allChannelsFlag && alphaLocked) */{
- KoStreamedMath::genericComposite32_novector<haveMask, false, OverCompositor32<quint8, quint32, true, false> >(params);
+ KoStreamedMath<_impl>::template genericComposite32_novector<haveMask, false, OverCompositor32<quint8, quint32, true, false> >(params);
}
}
}
diff --git a/libs/pigment/compositeops/KoStreamedMath.h b/libs/pigment/compositeops/KoStreamedMath.h
index 9d2dfa8..3f2594c 100644
--- a/libs/pigment/compositeops/KoStreamedMath.h
+++ b/libs/pigment/compositeops/KoStreamedMath.h
@@ -21,11 +21,12 @@
#include "config-vc.h"
+#ifndef HAVE_SANE_VC
+#error "BUG: There is no reason in including this file when Vc is not present"
+#endif
-#ifdef HAVE_SANE_VC
#include <Vc/Vc>
#include <Vc/IO>
-#endif
#include <stdint.h>
@@ -39,13 +40,14 @@
#endif
#endif
-namespace KoStreamedMath {
+template<Vc::Implementation _impl>
+struct KoStreamedMath {
/**
* Composes src into dst without using vector instructions
*/
template<bool useMask, bool useFlow, class Compositor>
- void genericComposite32_novector(const KoCompositeOp::ParameterInfo& params)
+ static void genericComposite32_novector(const KoCompositeOp::ParameterInfo& params)
{
using namespace Arithmetic;
@@ -64,7 +66,7 @@ template<bool useMask, bool useFlow, class Compositor>
int blockRest = params.cols;
for(int i = 0; i < blockRest; i++) {
- Compositor::template compositeOnePixelScalar<useMask>(src, dst, mask, params.opacity, params.flow, params.channelFlags);
+ Compositor::template compositeOnePixelScalar<useMask, _impl>(src, dst, mask, params.opacity, params.flow, params.channelFlags);
src += srcLinearInc;
dst += linearInc;
@@ -86,13 +88,11 @@ static inline quint8 lerp_mixed_u8_float(quint8 a, quint8 b, float alpha) {
return quint8(qint16(b - a) * alpha + a);
}
-#if defined HAVE_SANE_VC
-
/**
* Get a vector containing first Vc::float_v::Size values of mask.
* Each source mask element is considered to be a 8-bit integer
*/
-inline Vc::float_v fetch_mask_8(const quint8 *data) {
+static inline Vc::float_v fetch_mask_8(const quint8 *data) {
Vc::uint_v data_i(data);
return Vc::float_v(Vc::int_v(data_i));
}
@@ -110,7 +110,7 @@ inline Vc::float_v fetch_mask_8(const quint8 *data) {
* causes #GP (General Protection Exception)
*/
template <bool aligned>
-inline Vc::float_v fetch_alpha_32(const quint8 *data) {
+static inline Vc::float_v fetch_alpha_32(const quint8 *data) {
Vc::uint_v data_i;
if (aligned) {
data_i.load((const quint32*)data, Vc::Aligned);
@@ -134,7 +134,7 @@ inline Vc::float_v fetch_alpha_32(const quint8 *data) {
* causes #GP (General Protection Exception)
*/
template <bool aligned>
-inline void fetch_colors_32(const quint8 *data,
+static inline void fetch_colors_32(const quint8 *data,
Vc::float_v &c1,
Vc::float_v &c2,
Vc::float_v &c3) {
@@ -161,7 +161,7 @@ inline void fetch_colors_32(const quint8 *data,
*
* NOTE: \p data must be aligned pointer!
*/
-inline void write_channels_32(quint8 *data,
+static inline void write_channels_32(quint8 *data,
Vc::float_v alpha,
Vc::float_v c1,
Vc::float_v c2,
@@ -191,7 +191,7 @@ inline void write_channels_32(quint8 *data,
* math of the composition
*/
template<bool useMask, bool useFlow, class Compositor>
- void genericComposite32(const KoCompositeOp::ParameterInfo& params)
+ static void genericComposite32(const KoCompositeOp::ParameterInfo& params)
{
using namespace Arithmetic;
@@ -251,7 +251,7 @@ template<bool useMask, bool useFlow, class Compositor>
}
for(int i = 0; i < blockAlign; i++) {
- Compositor::template compositeOnePixelScalar<useMask>(src, dst, mask, params.opacity, params.flow, params.channelFlags);
+ Compositor::template compositeOnePixelScalar<useMask, _impl>(src, dst, mask, params.opacity, params.flow, params.channelFlags);
src += srcLinearInc;
dst += linearInc;
@@ -261,7 +261,7 @@ template<bool useMask, bool useFlow, class Compositor>
}
for (int i = 0; i < blockAlignedVector; i++) {
- Compositor::template compositeVector<useMask, true>(src, dst, mask, params.opacity, params.flow);
+ Compositor::template compositeVector<useMask, true, _impl>(src, dst, mask, params.opacity, params.flow);
src += srcVectorInc;
dst += vectorInc;
@@ -271,7 +271,7 @@ template<bool useMask, bool useFlow, class Compositor>
}
for (int i = 0; i < blockUnalignedVector; i++) {
- Compositor::template compositeVector<useMask, false>(src, dst, mask, params.opacity, params.flow);
+ Compositor::template compositeVector<useMask, false, _impl>(src, dst, mask, params.opacity, params.flow);
src += srcVectorInc;
dst += vectorInc;
@@ -282,7 +282,7 @@ template<bool useMask, bool useFlow, class Compositor>
for(int i = 0; i < blockRest; i++) {
- Compositor::template compositeOnePixelScalar<useMask>(src, dst, mask, params.opacity, params.flow, params.channelFlags);
+ Compositor::template compositeOnePixelScalar<useMask, _impl>(src, dst, mask, params.opacity, params.flow, params.channelFlags);
src += srcLinearInc;
dst += linearInc;
@@ -304,24 +304,6 @@ template<bool useMask, bool useFlow, class Compositor>
}
}
-#else /* if ! defined HAVE_SANE_VC */
-
-/**
- * Fall back to the scalar version of the composition.
- *
- * Don't use this method! The scalar floating point version of the
- * algorithm is up to 2 times slower then the basic integer
- * implementation! Use another composite op instead!
- */
-
-template<bool useMask, bool useFlow, class Compositor>
- void genericComposite32(const KoCompositeOp::ParameterInfo& params)
-{
- genericComposite32_novector<useMask, useFlow, Compositor>(params);
-}
-
-#endif /* HAVE_SANE_VC */
-
-}
+};
#endif /* __VECTOR_MATH_H */
diff --git a/libs/pigment/compositeops/KoVcMultiArchBuildSupport.h b/libs/pigment/compositeops/KoVcMultiArchBuildSupport.h
new file mode 100644
index 0000000..28d7717
--- /dev/null
+++ b/libs/pigment/compositeops/KoVcMultiArchBuildSupport.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2012 Dmitry Kazakov <dimula73 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __KOVCMULTIARCHBUILDSUPPORT_H
+#define __KOVCMULTIARCHBUILDSUPPORT_H
+
+#include <Vc/Vc>
+#include <Vc/common/support.h>
+
+/**
+ * Picks, at run time, the first implementation reported as supported by
+ * Vc::isImplementationSupported() and returns a new
+ * FactoryType<that implementation>. The probing order is AVX, SSE4.2,
+ * SSE4.1, SSE4a, SSSE3, SSE3, SSE2.
+ *
+ * \return the new factory, or 0 when none of the listed
+ *         implementations is supported
+ */
+template<template<Vc::Implementation _impl> class FactoryType, class ReturnType>
+ReturnType* createOptimizedFactoryNoScalar()
+{
+    // Vc::Fma4Impl and Vc::XopImpl are intentionally not probed yet;
+    // once enabled they would be checked first, Fma4Impl before XopImpl.
+
+    if (Vc::isImplementationSupported(Vc::AVXImpl)) {
+        return new FactoryType<Vc::AVXImpl>();
+    }
+    if (Vc::isImplementationSupported(Vc::SSE42Impl)) {
+        return new FactoryType<Vc::SSE42Impl>();
+    }
+    if (Vc::isImplementationSupported(Vc::SSE41Impl)) {
+        return new FactoryType<Vc::SSE41Impl>();
+    }
+    if (Vc::isImplementationSupported(Vc::SSE4aImpl)) {
+        return new FactoryType<Vc::SSE4aImpl>();
+    }
+    if (Vc::isImplementationSupported(Vc::SSSE3Impl)) {
+        return new FactoryType<Vc::SSSE3Impl>();
+    }
+    if (Vc::isImplementationSupported(Vc::SSE3Impl)) {
+        return new FactoryType<Vc::SSE3Impl>();
+    }
+    if (Vc::isImplementationSupported(Vc::SSE2Impl)) {
+        return new FactoryType<Vc::SSE2Impl>();
+    }
+
+    // nothing suitable found
+    return 0;
+}
+
+/**
+ * Same as createOptimizedFactoryNoScalar(), but never returns 0: when no
+ * vector implementation is selected, a FactoryType<Vc::ScalarImpl> is
+ * created instead.
+ */
+template<template<Vc::Implementation _impl> class FactoryType, class ReturnType>
+ReturnType* createOptimizedFactory()
+{
+    if (ReturnType *vectorFactory = createOptimizedFactoryNoScalar<FactoryType, ReturnType>()) {
+        return vectorFactory;
+    }
+
+    return new FactoryType<Vc::ScalarImpl>();
+}
+
+// Applies the macro _DECL once to every vector implementation handled
+// here (SSE2 ... AVX). The XopImpl and Fma4Impl entries are commented out
+// inside the macro body.
+#define DECLARE_FOR_ALL_ARCHS_NO_SCALAR(_DECL) \
+ _DECL(Vc::SSE2Impl); \
+ _DECL(Vc::SSE3Impl); \
+ _DECL(Vc::SSSE3Impl); \
+ _DECL(Vc::SSE41Impl); \
+ _DECL(Vc::SSE42Impl); \
+ _DECL(Vc::SSE4aImpl); \
+ _DECL(Vc::AVXImpl);/* \
+ _DECL(Vc::XopImpl); \
+ _DECL(Vc::Fma4Impl);*/
+
+// Same as above, plus Vc::ScalarImpl.
+#define DECLARE_FOR_ALL_ARCHS(_DECL) \
+ DECLARE_FOR_ALL_ARCHS_NO_SCALAR(_DECL); \
+ _DECL(Vc::ScalarImpl);
+
+#endif /* __KOVCMULTIARCHBUILDSUPPORT_H */
More information about the kimageshop
mailing list