[calligra/vector_compositioning_kazakov] /: Added the first version of per-architecture binaries for composition

Dmitry Kazakov dimula73 at gmail.com
Sun Dec 2 15:46:14 UTC 2012


Git commit 7cf94c1af73cb7bf56365f27a5b77df32d862778 by Dmitry Kazakov.
Committed on 02/12/2012 at 16:44.
Pushed by dkazakov into branch 'vector_compositioning_kazakov'.

Added the first version of per-architecture binaries for composition

Pros:
+ we can have prebuilt versions for all the architectures supported
  by Vc (AMD FMA4 and XOP are not supported by Vc yet)
+ the implementation is chosen dynamically on Krita start
+ the semi-general code for multi-arch builds is now in
  KoVcMultiArchBuildSupport.h (might be ported upstream in the future)

Cons:
- it depends on Vc's 'staging' branch, so it can't be put in master
  right now
- the code became much less readable due to all that template magic
- I had to copy-paste Vc's 'vc_compile_for_all_implementations' cmake
  macro, because we do not need 'Scalar' implementation
- the size of the pigment library grew almost 1.5 times: 11->17 MiB
  (probably, we still need plugin system for this)

CCMAIL:kimageshop at kde.org
CCMAIL:kretz at kde.org

M  +2    -2    CMakeLists.txt
M  +2    -2    krita/benchmarks/kis_composition_benchmark.cpp
M  +55   -3    libs/pigment/CMakeLists.txt
M  +15   -19   libs/pigment/compositeops/KoOptimizedCompositeOpAlphaDarken32.h
M  +29   -15   libs/pigment/compositeops/KoOptimizedCompositeOpFactory.cpp
A  +101  -0    libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.cpp     [License: GPL (v2+)]
A  +58   -0    libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.h     [License: GPL (v2+)]
M  +19   -23   libs/pigment/compositeops/KoOptimizedCompositeOpOver32.h
M  +17   -35   libs/pigment/compositeops/KoStreamedMath.h
A  +74   -0    libs/pigment/compositeops/KoVcMultiArchBuildSupport.h     [License: GPL (v2+)]

http://commits.kde.org/calligra/7cf94c1af73cb7bf56365f27a5b77df32d862778

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a439203..d2b489c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -324,8 +324,8 @@ if(HAVE_VC)
     message(STATUS "Vc found!")
 
     SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Vc_CMAKE_MODULES_DIR}")
-    include (OptimizeForArchitecture)
-    OptimizeForArchitecture()
+#    include (OptimizeForArchitecture)
+#    OptimizeForArchitecture()
 endif(HAVE_VC)
 set(CMAKE_MODULE_PATH ${OLD_CMAKE_MODULE_PATH} )
 
diff --git a/krita/benchmarks/kis_composition_benchmark.cpp b/krita/benchmarks/kis_composition_benchmark.cpp
index 9e3ef08..33b6427 100644
--- a/krita/benchmarks/kis_composition_benchmark.cpp
+++ b/krita/benchmarks/kis_composition_benchmark.cpp
@@ -363,10 +363,10 @@ void checkRounding()
     quint8 *msk2 = tiles[1].mask;
 
     for (int i = 0; i < numBlocks; i++) {
-        Compositor::template compositeVector<true,true>(src1, dst1, msk1, 0.5, 0.3);
+        Compositor::template compositeVector<true,true, VC_IMPL>(src1, dst1, msk1, 0.5, 0.3);
         for (int j = 0; j < vecSize; j++) {
 
-            Compositor::template compositeOnePixelScalar<true>(src2, dst2, msk2, 0.5, 0.3, QBitArray());
+            Compositor::template compositeOnePixelScalar<true, VC_IMPL>(src2, dst2, msk2, 0.5, 0.3, QBitArray());
 
             if(!comparePixels(dst1, dst2, 0)) {
                 qDebug() << "Wrong rounding in pixel:" << 8 * i + j;
diff --git a/libs/pigment/CMakeLists.txt b/libs/pigment/CMakeLists.txt
index 2eb7019f..68e7f5e 100644
--- a/libs/pigment/CMakeLists.txt
+++ b/libs/pigment/CMakeLists.txt
@@ -12,9 +12,60 @@ endif(OPENEXR_FOUND)
 
 set(LINK_VC_LIB)
 if(HAVE_SANE_VC)
-  include_directories(${Vc_INCLUDE_DIR})
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${Vc_DEFINITIONS}")
-  set(LINK_VC_LIB ${Vc_LIBRARIES})
+    include_directories(${Vc_INCLUDE_DIR})
+
+    set(OLD_CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} )
+    SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Vc_CMAKE_MODULES_DIR}")
+    include (VcMacros)
+
+# This is a copy-paste from VcMacros.cmake
+# we need a version *without* Scalar implementation
+macro(ca_compile_for_all_implementations _objs _src)
+   set(${_objs})
+
+   # remove all -march, -msse, etc. flags from the flags we want to pass
+   string(REPLACE "${Vc_ARCHITECTURE_FLAGS}" "" _flags "${Vc_DEFINITIONS}")
+   string(REPLACE "-DVC_IMPL=[^ ]*" "" _flags "${_flags}")
+
+   # capture the -march= switch as -mtune; if there is none skip it
+   if(Vc_ARCHITECTURE_FLAGS MATCHES "-march=")
+      string(REGEX REPLACE "^.*-march=([^ ]*).*$" "-mtune=\\1" _tmp "${Vc_ARCHITECTURE_FLAGS}")
+      set(_flags "${_flags} ${_tmp}")
+   endif()
+
+   # make a semicolon separated list of all flags
+   string(TOUPPER "${CMAKE_BUILD_TYPE}" _tmp)
+   set(_tmp "CMAKE_CXX_FLAGS_${_tmp}")
+   string(REPLACE " " ";" _flags "${CMAKE_CXX_FLAGS} ${${_tmp}} ${_flags} ${ARGN}")
+   get_directory_property(_inc INCLUDE_DIRECTORIES)
+   foreach(_i ${_inc})
+      list(APPEND _flags "-I${_i}")
+   endforeach()
+
+   set(_vc_compile_src "${_src}")
+
+# commented out intentionally
+#   _vc_compile_one_implementation(${_objs} Scalar NO_FLAG)
+   if(NOT Vc_SSE_INTRINSICS_BROKEN)
+      _vc_compile_one_implementation(${_objs} SSE2   "-msse2"   "-xSSE2"   "/arch:SSE2")
+      _vc_compile_one_implementation(${_objs} SSE3   "-msse3"   "-xSSE3"   "/arch:SSE2")
+      _vc_compile_one_implementation(${_objs} SSSE3  "-mssse3"  "-xSSSE3"  "/arch:SSE2")
+      _vc_compile_one_implementation(${_objs} SSE4_1 "-msse4.1" "-xSSE4.1" "/arch:SSE2")
+      _vc_compile_one_implementation(${_objs} SSE4_2 "-msse4.2" "-xSSE4.2" "/arch:SSE2")
+      _vc_compile_one_implementation(${_objs} SSE4a  "-msse4a"  "-xSSSE3"  "/arch:SSE2")
+   endif()
+   if(NOT Vc_AVX_INTRINSICS_BROKEN)
+      _vc_compile_one_implementation(${_objs} AVX      "-mavx"    "-xAVX"    "/arch:AVX")
+   endif()
+endmacro()
+
+    ca_compile_for_all_implementations(__per_arch_factory_objs compositeops/KoOptimizedCompositeOpFactoryPerArch.cpp "-fPIC")
+    message("Following objects are generated from the per-arch lib")
+    message(${__per_arch_factory_objs})
+
+    set(CMAKE_MODULE_PATH ${OLD_CMAKE_MODULE_PATH} )
+
+    set(LINK_VC_LIB ${Vc_LIBRARIES})
 endif(HAVE_SANE_VC)
 
 add_subdirectory(tests)
@@ -53,6 +104,7 @@ set(pigmentcms_SRCS
     colorspaces/KoSimpleColorSpaceEngine.cpp
     compositeops/KoOptimizedCompositeOpFactory.cpp
     compositeops/KoOptimizedCompositeOpFactory_p.cpp
+    ${__per_arch_factory_objs}
     colorprofiles/KoDummyColorProfile.cpp
     resources/KoAbstractGradient.cpp
     resources/KoColorSet.cpp
diff --git a/libs/pigment/compositeops/KoOptimizedCompositeOpAlphaDarken32.h b/libs/pigment/compositeops/KoOptimizedCompositeOpAlphaDarken32.h
index 57f03ef..b00a6db 100644
--- a/libs/pigment/compositeops/KoOptimizedCompositeOpAlphaDarken32.h
+++ b/libs/pigment/compositeops/KoOptimizedCompositeOpAlphaDarken32.h
@@ -21,7 +21,6 @@
 #ifndef KOOPTIMIZEDCOMPOSITEOPALPHADARKEN32_H_
 #define KOOPTIMIZEDCOMPOSITEOPALPHADARKEN32_H_
 
-#include "KoCompositeOpFunctions.h"
 #include "KoCompositeOpBase.h"
 
 #include "KoStreamedMath.h"
@@ -42,9 +41,7 @@ struct AlphaDarkenCompositor32 {
      * o This function is *never* used if HAVE_SANE_VC is not present
      */
 
-#ifdef HAVE_SANE_VC
-
-    template<bool haveMask, bool src_aligned>
+    template<bool haveMask, bool src_aligned, Vc::Implementation _impl>
     static ALWAYS_INLINE void compositeVector(const quint8 *src, quint8 *dst, const quint8 *mask, float opacity, float flow)
     {
         Vc::float_v src_alpha;
@@ -61,16 +58,16 @@ struct AlphaDarkenCompositor32 {
 
 
         Vc::float_v msk_norm_alpha;
-        src_alpha = KoStreamedMath::fetch_alpha_32<src_aligned>(src);
+        src_alpha = KoStreamedMath<_impl>::template fetch_alpha_32<src_aligned>(src);
 
         if (haveMask) {
-            Vc::float_v mask_vec = KoStreamedMath::fetch_mask_8(mask);
+            Vc::float_v mask_vec = KoStreamedMath<_impl>::fetch_mask_8(mask);
             msk_norm_alpha = src_alpha * mask_vec * uint8MaxRec2;
         } else {
             msk_norm_alpha = src_alpha * uint8MaxRec1;
         }
 
-        dst_alpha = KoStreamedMath::fetch_alpha_32<true>(dst);
+        dst_alpha = KoStreamedMath<_impl>::template fetch_alpha_32<true>(dst);
         src_alpha = msk_norm_alpha * opacity_vec;
 
         Vc::float_m empty_dst_pixels_mask = dst_alpha == zeroValue;
@@ -83,7 +80,7 @@ struct AlphaDarkenCompositor32 {
         Vc::float_v dst_c2;
         Vc::float_v dst_c3;
 
-        KoStreamedMath::fetch_colors_32<src_aligned>(src, src_c1, src_c2, src_c3);
+        KoStreamedMath<_impl>::template fetch_colors_32<src_aligned>(src, src_c1, src_c2, src_c3);
 
         bool srcAlphaIsZero = (src_alpha == zeroValue).isFull();
         if (srcAlphaIsZero) return;
@@ -109,12 +106,12 @@ struct AlphaDarkenCompositor32 {
                 dst_c3 = src_c3;
             }
         } else if (empty_dst_pixels_mask.isEmpty()) {
-            KoStreamedMath::fetch_colors_32<true>(dst, dst_c1, dst_c2, dst_c3);
+            KoStreamedMath<_impl>::template fetch_colors_32<true>(dst, dst_c1, dst_c2, dst_c3);
             dst_c1 = dst_blend * (src_c1 - dst_c1) + dst_c1;
             dst_c2 = dst_blend * (src_c2 - dst_c2) + dst_c2;
             dst_c3 = dst_blend * (src_c3 - dst_c3) + dst_c3;
         } else {
-            KoStreamedMath::fetch_colors_32<true>(dst, dst_c1, dst_c2, dst_c3);
+            KoStreamedMath<_impl>::template fetch_colors_32<true>(dst, dst_c1, dst_c2, dst_c3);
             dst_c1(empty_dst_pixels_mask) = src_c1;
             dst_c2(empty_dst_pixels_mask) = src_c2;
             dst_c3(empty_dst_pixels_mask) = src_c3;
@@ -136,15 +133,13 @@ struct AlphaDarkenCompositor32 {
         alpha2(alpha2_mask) = opt1;
         dst_alpha = (alpha2 - alpha1) * flow_norm_vec + alpha1;
 
-        KoStreamedMath::write_channels_32(dst, dst_alpha, dst_c1, dst_c2, dst_c3);
+        KoStreamedMath<_impl>::write_channels_32(dst, dst_alpha, dst_c1, dst_c2, dst_c3);
     }
 
-#endif /* HAVE_SANE_VC */
-
     /**
      * Composes one pixel of the source into the destination
      */
-    template <bool haveMask>
+    template <bool haveMask, Vc::Implementation _impl>
     static ALWAYS_INLINE void compositeOnePixelScalar(const channels_type *src, channels_type *dst, const quint8 *mask, float opacity, float flow, const QBitArray &channelFlags)
     {
         Q_UNUSED(channelFlags);
@@ -176,9 +171,9 @@ struct AlphaDarkenCompositor32 {
         }
 
         if (dstAlphaInt != 0) {
-            dst[0] = KoStreamedMath::lerp_mixed_u8_float(dst[0], src[0], srcAlphaNorm);
-            dst[1] = KoStreamedMath::lerp_mixed_u8_float(dst[1], src[1], srcAlphaNorm);
-            dst[2] = KoStreamedMath::lerp_mixed_u8_float(dst[2], src[2], srcAlphaNorm);
+            dst[0] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[0], src[0], srcAlphaNorm);
+            dst[1] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[1], src[1], srcAlphaNorm);
+            dst[2] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[2], src[2], srcAlphaNorm);
         } else {
             const pixel_type *s = reinterpret_cast<const pixel_type*>(src);
             pixel_type *d = reinterpret_cast<pixel_type*>(dst);
@@ -196,6 +191,7 @@ struct AlphaDarkenCompositor32 {
  * colorspaces with alpha channel placed at the last byte of
  * the pixel: C1_C2_C3_A.
  */
+template<Vc::Implementation _impl>
 class KoOptimizedCompositeOpAlphaDarken32 : public KoCompositeOp
 {
 public:
@@ -207,9 +203,9 @@ public:
     virtual void composite(const KoCompositeOp::ParameterInfo& params) const
     {
         if(params.maskRowStart) {
-            KoStreamedMath::genericComposite32<true, true, AlphaDarkenCompositor32<quint8, quint32> >(params);
+            KoStreamedMath<_impl>::template genericComposite32<true, true, AlphaDarkenCompositor32<quint8, quint32> >(params);
         } else {
-            KoStreamedMath::genericComposite32<false, true, AlphaDarkenCompositor32<quint8, quint32> >(params);
+            KoStreamedMath<_impl>::template genericComposite32<false, true, AlphaDarkenCompositor32<quint8, quint32> >(params);
         }
     }
 };
diff --git a/libs/pigment/compositeops/KoOptimizedCompositeOpFactory.cpp b/libs/pigment/compositeops/KoOptimizedCompositeOpFactory.cpp
index 8982378..0f37ccc 100644
--- a/libs/pigment/compositeops/KoOptimizedCompositeOpFactory.cpp
+++ b/libs/pigment/compositeops/KoOptimizedCompositeOpFactory.cpp
@@ -19,39 +19,53 @@
 #include "KoOptimizedCompositeOpFactory.h"
 #include "KoOptimizedCompositeOpFactory_p.h"
 
-/**
- * We include these headers even when no vectorization
- * is available on the system to ensure they build correctly
- */
-#include "KoOptimizedCompositeOpAlphaDarken32.h"
-#include "KoOptimizedCompositeOpOver32.h"
-
-
 #include "config-vc.h"
-
 #ifdef HAVE_SANE_VC
 #include <Vc/global.h>
 #include <Vc/common/support.h>
-#endif
+#include "KoOptimizedCompositeOpFactoryPerArch.h"
+
 
+static struct ArchReporter {
+    ArchReporter() {
+        KoOptimizedCompositeOpFactoryPerArchBase *factory =
+            createOptimizedCompositeOpFactory();
+        if (factory) {
+            factory->printArchInfo();
+            delete factory;
+        }
+    }
+} StaticReporter;
+
+
+#endif
 
 KoCompositeOp* KoOptimizedCompositeOpFactory::createAlphaDarkenOp32(const KoColorSpace *cs)
 {
 #if defined HAVE_SANE_VC
-    if (Vc::currentImplementationSupported()) {
-        return new KoOptimizedCompositeOpAlphaDarken32(cs);
+    KoOptimizedCompositeOpFactoryPerArchBase *factory =
+        createOptimizedCompositeOpFactory();
+    if (factory) {
+        KoCompositeOp *op = factory->createAlphaDarkenOp32(cs);
+        delete factory;
+        return op;
     }
 #endif
+
     return KoOptimizedCompositeOpFactoryPrivate::createLegacyAlphaDarkenOp32(cs);
 }
 
 KoCompositeOp* KoOptimizedCompositeOpFactory::createOverOp32(const KoColorSpace *cs)
 {
 #if defined HAVE_SANE_VC
-    if (Vc::currentImplementationSupported()) {
-        return new KoOptimizedCompositeOpOver32(cs);
+    KoOptimizedCompositeOpFactoryPerArchBase *factory =
+        createOptimizedCompositeOpFactory();
+    if (factory) {
+        KoCompositeOp *op = factory->createOverOp32(cs);
+        delete factory;
+        return op;
     }
 #endif
-    return KoOptimizedCompositeOpFactoryPrivate::createLegacyOverOp32(cs);
 
+    return KoOptimizedCompositeOpFactoryPrivate::createLegacyOverOp32(cs);
 }
diff --git a/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.cpp b/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.cpp
new file mode 100644
index 0000000..43ba698
--- /dev/null
+++ b/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.cpp
@@ -0,0 +1,101 @@
+/*
+ *  Copyright (c) 2012 Dmitry Kazakov <dimula73 at gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "KoOptimizedCompositeOpFactoryPerArch.h"
+
+#include <QDebug>
+
+#include "KoOptimizedCompositeOpAlphaDarken32.h"
+#include "KoOptimizedCompositeOpOver32.h"
+
+template<>
+KoCompositeOp* KoOptimizedCompositeOpFactoryPerArch<VC_IMPL>::createAlphaDarkenOp32(const KoColorSpace *cs)
+{
+    return new KoOptimizedCompositeOpAlphaDarken32<VC_IMPL>(cs);
+}
+
+template<>
+KoCompositeOp* KoOptimizedCompositeOpFactoryPerArch<VC_IMPL>::createOverOp32(const KoColorSpace *cs)
+{
+    return new KoOptimizedCompositeOpOver32<VC_IMPL>(cs);
+}
+
+#define __stringify(_s) #_s
+#define stringify(_s) __stringify(_s)
+
+#ifdef __SSE2__
+#  define HAVE_SSE2 1
+#else
+#  define HAVE_SSE2 0
+#endif
+
+#ifdef __SSE3__
+#  define HAVE_SSE3 1
+#else
+#  define HAVE_SSE3 0
+#endif
+
+#ifdef __SSSE3__
+#  define HAVE_SSSE3 1
+#else
+#  define HAVE_SSSE3 0
+#endif
+
+#ifdef __SSE4_1__
+#  define HAVE_SSE4_1 1
+#else
+#  define HAVE_SSE4_1 0
+#endif
+
+#ifdef __SSE4_2__
+#  define HAVE_SSE4_2 1
+#else
+#  define HAVE_SSE4_2 0
+#endif
+
+#ifdef __SSE4a__
+#  define HAVE_SSE4a 1
+#else
+#  define HAVE_SSE4a 0
+#endif
+
+#ifdef __AVX__
+#  define HAVE_AVX 1
+#else
+#  define HAVE_AVX 0
+#endif
+
+inline void printFeatureSupported(const QString &feature,
+                                  bool present)
+{
+    qDebug() << "\t" << feature << "\t---\t" << (present ? "yes" : "no");
+}
+
+template<>
+void KoOptimizedCompositeOpFactoryPerArch<VC_IMPL>::printArchInfo()
+{
+    qDebug() << "Compiled for arch:" << stringify(VC_IMPL);
+    qDebug() << "Features supported:";
+    printFeatureSupported("SSE2", HAVE_SSE2);
+    printFeatureSupported("SSE3", HAVE_SSE3);
+    printFeatureSupported("SSSE3", HAVE_SSSE3);
+    printFeatureSupported("SSE4.1", HAVE_SSE4_1);
+    printFeatureSupported("SSE4.2", HAVE_SSE4_2);
+    printFeatureSupported("SSE4a", HAVE_SSE4a);
+    printFeatureSupported("AVX ", HAVE_AVX);
+}
diff --git a/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.h b/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.h
new file mode 100644
index 0000000..3418a49
--- /dev/null
+++ b/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.h
@@ -0,0 +1,58 @@
+/*
+ *  Copyright (c) 2012 Dmitry Kazakov <dimula73 at gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef KOOPTIMIZEDCOMPOSITEOPFACTORYPERARCH_H
+#define KOOPTIMIZEDCOMPOSITEOPFACTORYPERARCH_H
+
+#include "config-vc.h"
+#ifndef HAVE_SANE_VC
+#error "BUG: There is no reason in including this file when Vc is not present"
+#endif
+
+#include "KoVcMultiArchBuildSupport.h"
+
+
+class KoCompositeOp;
+class KoColorSpace;
+
+struct KoOptimizedCompositeOpFactoryPerArchBase
+{
+    virtual ~KoOptimizedCompositeOpFactoryPerArchBase() {}
+    virtual KoCompositeOp* createAlphaDarkenOp32(const KoColorSpace *cs) = 0;
+    virtual KoCompositeOp* createOverOp32(const KoColorSpace *cs) = 0;
+    virtual void printArchInfo() = 0;
+};
+
+template<Vc::Implementation _impl>
+struct KoOptimizedCompositeOpFactoryPerArch : public KoOptimizedCompositeOpFactoryPerArchBase
+{
+    KoCompositeOp* createAlphaDarkenOp32(const KoColorSpace *cs);
+    KoCompositeOp* createOverOp32(const KoColorSpace *cs);
+    void printArchInfo();
+};
+
+#define DECLARE_FOR_ARCH(__arch)                                        \
+    template<> KoCompositeOp* KoOptimizedCompositeOpFactoryPerArch<__arch>::createAlphaDarkenOp32(const KoColorSpace *cs); \
+    template<> KoCompositeOp* KoOptimizedCompositeOpFactoryPerArch<__arch>::createOverOp32(const KoColorSpace *cs); \
+    template<> void KoOptimizedCompositeOpFactoryPerArch<__arch>::printArchInfo();
+
+DECLARE_FOR_ALL_ARCHS_NO_SCALAR(DECLARE_FOR_ARCH);
+#define createOptimizedCompositeOpFactory createOptimizedFactoryNoScalar<KoOptimizedCompositeOpFactoryPerArch, KoOptimizedCompositeOpFactoryPerArchBase>
+
+
+#endif /* KOOPTIMIZEDCOMPOSITEOPFACTORYPERARCH_H */
diff --git a/libs/pigment/compositeops/KoOptimizedCompositeOpOver32.h b/libs/pigment/compositeops/KoOptimizedCompositeOpOver32.h
index 8d16f3e..5790cf5 100644
--- a/libs/pigment/compositeops/KoOptimizedCompositeOpOver32.h
+++ b/libs/pigment/compositeops/KoOptimizedCompositeOpOver32.h
@@ -21,7 +21,6 @@
 #ifndef KOOPTIMIZEDCOMPOSITEOPOVER32_H_
 #define KOOPTIMIZEDCOMPOSITEOPOVER32_H_
 
-#include "KoCompositeOpFunctions.h"
 #include "KoCompositeOpBase.h"
 
 #include "KoStreamedMath.h"
@@ -31,9 +30,7 @@ template<typename channels_type, typename pixel_type, bool alphaLocked, bool all
 struct OverCompositor32 {
     // \see docs in AlphaDarkenCompositor32
 
-#ifdef HAVE_SANE_VC
-
-    template<bool haveMask, bool src_aligned>
+    template<bool haveMask, bool src_aligned, Vc::Implementation _impl>
     static ALWAYS_INLINE void compositeVector(const quint8 *src, quint8 *dst, const quint8 *mask, float opacity, float flow)
     {
         Q_UNUSED(flow);
@@ -41,7 +38,7 @@ struct OverCompositor32 {
         Vc::float_v src_alpha;
         Vc::float_v dst_alpha;
 
-        src_alpha = KoStreamedMath::fetch_alpha_32<src_aligned>(src);
+        src_alpha = KoStreamedMath<_impl>::template fetch_alpha_32<src_aligned>(src);
 
         bool haveOpacity = opacity != 1.0;
         Vc::float_v opacity_norm_vec(opacity);
@@ -54,7 +51,7 @@ struct OverCompositor32 {
         src_alpha *= opacity_norm_vec;
 
         if (haveMask) {
-            Vc::float_v mask_vec = KoStreamedMath::fetch_mask_8(mask);
+            Vc::float_v mask_vec = KoStreamedMath<_impl>::fetch_mask_8(mask);
             src_alpha *= mask_vec * uint8MaxRec1;
         }
 
@@ -64,7 +61,7 @@ struct OverCompositor32 {
             return;
         }
 
-        dst_alpha = KoStreamedMath::fetch_alpha_32<true>(dst);
+        dst_alpha = KoStreamedMath<_impl>::template fetch_alpha_32<true>(dst);
 
         Vc::float_v src_c1;
         Vc::float_v src_c2;
@@ -75,7 +72,7 @@ struct OverCompositor32 {
         Vc::float_v dst_c3;
 
 
-        KoStreamedMath::fetch_colors_32<src_aligned>(src, src_c1, src_c2, src_c3);
+        KoStreamedMath<_impl>::template fetch_colors_32<src_aligned>(src, src_c1, src_c2, src_c3);
         Vc::float_v src_blend;
         Vc::float_v new_alpha;
 
@@ -97,7 +94,7 @@ struct OverCompositor32 {
         }
 
         if (!(src_blend == oneValue).isFull()) {
-            KoStreamedMath::fetch_colors_32<true>(dst, dst_c1, dst_c2, dst_c3);
+            KoStreamedMath<_impl>::template fetch_colors_32<true>(dst, dst_c1, dst_c2, dst_c3);
 
             dst_c1 = src_blend * (src_c1 - dst_c1) + dst_c1;
             dst_c2 = src_blend * (src_c2 - dst_c2) + dst_c2;
@@ -116,12 +113,10 @@ struct OverCompositor32 {
             }
         }
 
-        KoStreamedMath::write_channels_32(dst, new_alpha, dst_c1, dst_c2, dst_c3);
+        KoStreamedMath<_impl>::write_channels_32(dst, new_alpha, dst_c1, dst_c2, dst_c3);
     }
 
-#endif /* HAVE_SANE_VC */
-
-    template <bool haveMask>
+    template <bool haveMask, Vc::Implementation _impl>
     static ALWAYS_INLINE void compositeOnePixelScalar(const channels_type *src, channels_type *dst, const quint8 *mask, float opacity, float flow, const QBitArray &channelFlags)
     {
         Q_UNUSED(flow);
@@ -165,9 +160,9 @@ struct OverCompositor32 {
                     pixel_type *d = reinterpret_cast<pixel_type*>(dst);
                     *d = *s;
                 } else if (srcBlendNorm != 0.0){
-                    dst[0] = KoStreamedMath::lerp_mixed_u8_float(dst[0], src[0], srcBlendNorm);
-                    dst[1] = KoStreamedMath::lerp_mixed_u8_float(dst[1], src[1], srcBlendNorm);
-                    dst[2] = KoStreamedMath::lerp_mixed_u8_float(dst[2], src[2], srcBlendNorm);
+                    dst[0] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[0], src[0], srcBlendNorm);
+                    dst[1] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[1], src[1], srcBlendNorm);
+                    dst[2] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[2], src[2], srcBlendNorm);
                 }
             } else {
                 if (srcBlendNorm == 1.0) {
@@ -175,9 +170,9 @@ struct OverCompositor32 {
                     if(channelFlags.at(1)) dst[1] = src[1];
                     if(channelFlags.at(2)) dst[2] = src[2];
                 } else if (srcBlendNorm != 0.0) {
-                    if(channelFlags.at(0)) dst[0] = KoStreamedMath::lerp_mixed_u8_float(dst[0], src[0], srcBlendNorm);
-                    if(channelFlags.at(1)) dst[1] = KoStreamedMath::lerp_mixed_u8_float(dst[1], src[1], srcBlendNorm);
-                    if(channelFlags.at(2)) dst[2] = KoStreamedMath::lerp_mixed_u8_float(dst[2], src[2], srcBlendNorm);
+                    if(channelFlags.at(0)) dst[0] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[0], src[0], srcBlendNorm);
+                    if(channelFlags.at(1)) dst[1] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[1], src[1], srcBlendNorm);
+                    if(channelFlags.at(2)) dst[2] = KoStreamedMath<_impl>::lerp_mixed_u8_float(dst[2], src[2], srcBlendNorm);
                 }
             }
 
@@ -193,6 +188,7 @@ struct OverCompositor32 {
  * colorspaces with alpha channel placed at the last byte of
  * the pixel: C1_C2_C3_A.
  */
+template<Vc::Implementation _impl>
 class KoOptimizedCompositeOpOver32 : public KoCompositeOp
 {
 public:
@@ -215,7 +211,7 @@ public:
         if (params.channelFlags.isEmpty() ||
             params.channelFlags == QBitArray(4, true)) {
 
-            KoStreamedMath::genericComposite32<haveMask, false, OverCompositor32<quint8, quint32, false, true> >(params);
+            KoStreamedMath<_impl>::template genericComposite32<haveMask, false, OverCompositor32<quint8, quint32, false, true> >(params);
         } else {
             const bool allChannelsFlag =
                 params.channelFlags.at(0) &&
@@ -226,11 +222,11 @@ public:
                 !params.channelFlags.at(3);
 
             if (allChannelsFlag && alphaLocked) {
-                KoStreamedMath::genericComposite32_novector<haveMask, false, OverCompositor32<quint8, quint32, true, true> >(params);
+                KoStreamedMath<_impl>::template genericComposite32_novector<haveMask, false, OverCompositor32<quint8, quint32, true, true> >(params);
             } else if (!allChannelsFlag && !alphaLocked) {
-                KoStreamedMath::genericComposite32_novector<haveMask, false, OverCompositor32<quint8, quint32, false, false> >(params);
+                KoStreamedMath<_impl>::template genericComposite32_novector<haveMask, false, OverCompositor32<quint8, quint32, false, false> >(params);
             } else /*if (!allChannelsFlag && alphaLocked) */{
-                KoStreamedMath::genericComposite32_novector<haveMask, false, OverCompositor32<quint8, quint32, true, false> >(params);
+                KoStreamedMath<_impl>::template genericComposite32_novector<haveMask, false, OverCompositor32<quint8, quint32, true, false> >(params);
             }
         }
     }
diff --git a/libs/pigment/compositeops/KoStreamedMath.h b/libs/pigment/compositeops/KoStreamedMath.h
index 9d2dfa8..3f2594c 100644
--- a/libs/pigment/compositeops/KoStreamedMath.h
+++ b/libs/pigment/compositeops/KoStreamedMath.h
@@ -21,11 +21,12 @@
 
 
 #include "config-vc.h"
+#ifndef HAVE_SANE_VC
+#error "BUG: There is no reason in including this file when Vc is not present"
+#endif
 
-#ifdef HAVE_SANE_VC
 #include <Vc/Vc>
 #include <Vc/IO>
-#endif
 
 #include <stdint.h>
 
@@ -39,13 +40,14 @@
 #endif
 #endif
 
-namespace KoStreamedMath {
+template<Vc::Implementation _impl>
+struct KoStreamedMath {
 
 /**
  * Composes src into dst without using vector instructions
  */
 template<bool useMask, bool useFlow, class Compositor>
-    void genericComposite32_novector(const KoCompositeOp::ParameterInfo& params)
+    static void genericComposite32_novector(const KoCompositeOp::ParameterInfo& params)
 {
     using namespace Arithmetic;
 
@@ -64,7 +66,7 @@ template<bool useMask, bool useFlow, class Compositor>
         int blockRest = params.cols;
 
         for(int i = 0; i < blockRest; i++) {
-            Compositor::template compositeOnePixelScalar<useMask>(src, dst, mask, params.opacity, params.flow, params.channelFlags);
+            Compositor::template compositeOnePixelScalar<useMask, _impl>(src, dst, mask, params.opacity, params.flow, params.channelFlags);
             src += srcLinearInc;
             dst += linearInc;
 
@@ -86,13 +88,11 @@ static inline quint8 lerp_mixed_u8_float(quint8 a, quint8 b, float alpha) {
     return quint8(qint16(b - a) * alpha + a);
 }
 
-#if defined HAVE_SANE_VC
-
 /**
  * Get a vector containing first Vc::float_v::Size values of mask.
  * Each source mask element is considered to be a 8-bit integer
  */
-inline Vc::float_v fetch_mask_8(const quint8 *data) {
+static inline Vc::float_v fetch_mask_8(const quint8 *data) {
     Vc::uint_v data_i(data);
     return Vc::float_v(Vc::int_v(data_i));
 }
@@ -110,7 +110,7 @@ inline Vc::float_v fetch_mask_8(const quint8 *data) {
  *               causes #GP (General Protection Exception)
  */
 template <bool aligned>
-inline Vc::float_v fetch_alpha_32(const quint8 *data) {
+static inline Vc::float_v fetch_alpha_32(const quint8 *data) {
     Vc::uint_v data_i;
     if (aligned) {
         data_i.load((const quint32*)data, Vc::Aligned);
@@ -134,7 +134,7 @@ inline Vc::float_v fetch_alpha_32(const quint8 *data) {
  *               causes #GP (General Protection Exception)
  */
 template <bool aligned>
-inline void fetch_colors_32(const quint8 *data,
+static inline void fetch_colors_32(const quint8 *data,
                             Vc::float_v &c1,
                             Vc::float_v &c2,
                             Vc::float_v &c3) {
@@ -161,7 +161,7 @@ inline void fetch_colors_32(const quint8 *data,
  *
  * NOTE: \p data must be aligned pointer!
  */
-inline void write_channels_32(quint8 *data,
+static inline void write_channels_32(quint8 *data,
                               Vc::float_v alpha,
                               Vc::float_v c1,
                               Vc::float_v c2,
@@ -191,7 +191,7 @@ inline void write_channels_32(quint8 *data,
  * math of the composition
  */
 template<bool useMask, bool useFlow, class Compositor>
-    void genericComposite32(const KoCompositeOp::ParameterInfo& params)
+    static void genericComposite32(const KoCompositeOp::ParameterInfo& params)
 {
     using namespace Arithmetic;
 
@@ -251,7 +251,7 @@ template<bool useMask, bool useFlow, class Compositor>
         }
 
         for(int i = 0; i < blockAlign; i++) {
-            Compositor::template compositeOnePixelScalar<useMask>(src, dst, mask, params.opacity, params.flow, params.channelFlags);
+            Compositor::template compositeOnePixelScalar<useMask, _impl>(src, dst, mask, params.opacity, params.flow, params.channelFlags);
             src += srcLinearInc;
             dst += linearInc;
 
@@ -261,7 +261,7 @@ template<bool useMask, bool useFlow, class Compositor>
         }
 
         for (int i = 0; i < blockAlignedVector; i++) {
-            Compositor::template compositeVector<useMask, true>(src, dst, mask, params.opacity, params.flow);
+            Compositor::template compositeVector<useMask, true, _impl>(src, dst, mask, params.opacity, params.flow);
             src += srcVectorInc;
             dst += vectorInc;
 
@@ -271,7 +271,7 @@ template<bool useMask, bool useFlow, class Compositor>
         }
 
         for (int i = 0; i < blockUnalignedVector; i++) {
-            Compositor::template compositeVector<useMask, false>(src, dst, mask, params.opacity, params.flow);
+            Compositor::template compositeVector<useMask, false, _impl>(src, dst, mask, params.opacity, params.flow);
             src += srcVectorInc;
             dst += vectorInc;
 
@@ -282,7 +282,7 @@ template<bool useMask, bool useFlow, class Compositor>
 
 
         for(int i = 0; i < blockRest; i++) {
-            Compositor::template compositeOnePixelScalar<useMask>(src, dst, mask, params.opacity, params.flow, params.channelFlags);
+            Compositor::template compositeOnePixelScalar<useMask, _impl>(src, dst, mask, params.opacity, params.flow, params.channelFlags);
             src += srcLinearInc;
             dst += linearInc;
 
@@ -304,24 +304,6 @@ template<bool useMask, bool useFlow, class Compositor>
     }
 }
 
-#else /* if ! defined HAVE_SANE_VC */
-
-/**
- * Fall back to the scalar version of the composition.
- *
- * Don't use this method! The scalar floating point version of the
- * algorithm is up to 2 times slower then the basic integer
- * implementation! Use another composite op instead!
- */
-
-template<bool useMask, bool useFlow, class Compositor>
-    void genericComposite32(const KoCompositeOp::ParameterInfo& params)
-{
-    genericComposite32_novector<useMask, useFlow, Compositor>(params);
-}
-
-#endif /* HAVE_SANE_VC */
-
-}
+};
 
 #endif /* __VECTOR_MATH_H */
diff --git a/libs/pigment/compositeops/KoVcMultiArchBuildSupport.h b/libs/pigment/compositeops/KoVcMultiArchBuildSupport.h
new file mode 100644
index 0000000..28d7717
--- /dev/null
+++ b/libs/pigment/compositeops/KoVcMultiArchBuildSupport.h
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (c) 2012 Dmitry Kazakov <dimula73 at gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __KOVCMULTIARCHBUILDSUPPORT_H
+#define __KOVCMULTIARCHBUILDSUPPORT_H
+
+#include <Vc/Vc>
+#include <Vc/common/support.h>
+
+/**
+ * Instantiate \p FactoryType for the best vector implementation the
+ * current CPU supports, probing from the widest instruction set (AVX)
+ * down to the narrowest (SSE2).  Returns 0 when not even SSE2 is
+ * available; the caller takes ownership of the returned factory.
+ *
+ * NOTE: the Fma4/Xop branches are deliberately commented out — per the
+ * commit message these AMD extensions are not supported by Vc yet.
+ * The branch order matters: the first supported (i.e. fastest) wins.
+ */
+template<template<Vc::Implementation _impl> class FactoryType, class ReturnType>
+    ReturnType* createOptimizedFactoryNoScalar()
+{
+    /*if (Vc::isImplementationSupported(Vc::Fma4Impl)) {
+        return new FactoryType<Vc::Fma4Impl>();
+    } else if (Vc::isImplementationSupported(Vc::XopImpl)) {
+        return new FactoryType<Vc::XopImpl>();
+        } else*/
+    if (Vc::isImplementationSupported(Vc::AVXImpl)) {
+        return new FactoryType<Vc::AVXImpl>();
+    } else if (Vc::isImplementationSupported(Vc::SSE42Impl)) {
+        return new FactoryType<Vc::SSE42Impl>();
+    } else if (Vc::isImplementationSupported(Vc::SSE41Impl)) {
+        return new FactoryType<Vc::SSE41Impl>();
+    } else if (Vc::isImplementationSupported(Vc::SSE4aImpl)) {
+        return new FactoryType<Vc::SSE4aImpl>();
+    } else if (Vc::isImplementationSupported(Vc::SSSE3Impl)) {
+        return new FactoryType<Vc::SSSE3Impl>();
+    } else if (Vc::isImplementationSupported(Vc::SSE3Impl)) {
+        return new FactoryType<Vc::SSE3Impl>();
+    } else if (Vc::isImplementationSupported(Vc::SSE2Impl)) {
+        return new FactoryType<Vc::SSE2Impl>();
+    }
+
+    return 0;
+}
+
+/**
+ * Like createOptimizedFactoryNoScalar(), but falls back to Vc's scalar
+ * implementation instead of returning 0, so the result is never null.
+ * The caller takes ownership of the returned factory.
+ */
+template<template<Vc::Implementation _impl> class FactoryType, class ReturnType>
+    ReturnType* createOptimizedFactory()
+{
+    ReturnType *f = createOptimizedFactoryNoScalar<FactoryType, ReturnType>();
+    return f ? f : new FactoryType<Vc::ScalarImpl>();
+}
+
+/**
+ * Expand \p _DECL once per vector architecture — the same set probed by
+ * createOptimizedFactoryNoScalar() (Xop/Fma4 kept commented out to
+ * match it).  Intended for explicitly instantiating per-arch templates.
+ */
+#define DECLARE_FOR_ALL_ARCHS_NO_SCALAR(_DECL)   \
+    _DECL(Vc::SSE2Impl);                         \
+    _DECL(Vc::SSE3Impl);                         \
+    _DECL(Vc::SSSE3Impl);                        \
+    _DECL(Vc::SSE41Impl);                        \
+    _DECL(Vc::SSE42Impl);                        \
+    _DECL(Vc::SSE4aImpl);                        \
+    _DECL(Vc::AVXImpl);/*                        \
+    _DECL(Vc::XopImpl);                          \
+    _DECL(Vc::Fma4Impl);*/
+
+/* Same as above, plus the scalar fallback implementation. */
+#define DECLARE_FOR_ALL_ARCHS(_DECL)             \
+    DECLARE_FOR_ALL_ARCHS_NO_SCALAR(_DECL);      \
+    _DECL(Vc::ScalarImpl);
+
+#endif /* __KOVCMULTIARCHBUILDSUPPORT_H */


More information about the kimageshop mailing list