[graphics/krita] /: [FEATURE] Port Krita to xsimd
L. E. Segovia
null at kde.org
Thu May 19 19:20:07 BST 2022
Git commit 70863966699379856e7fa08d0d8fc2aee4342c29 by L. E. Segovia.
Committed on 19/05/2022 at 17:40.
Pushed by lsegovia into branch 'master'.
[FEATURE] Port Krita to xsimd
This commit replaces our Vc backend with xsimd, QuantStack's SIMD
library backing the xtensor project.
This requires the following:
- wholesale replacement of Vc with xsimd alternatives
- implementation of dispatching against the xsimd::generic architecture
(i.e. the equivalent of Vc::ScalarImpl)
- implementation of rudimentary gather-scatter and integer rounding ops
This brings Arm support to our backend for Free(tm).
Co-Authored-By: Dmitry Kazakov <dimula73 at gmail.com>
CCMAIL: kimageshop at kde.org
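
For readers coming from Vc, the type mapping is largely mechanical. Below is a
minimal sketch (not part of the patch itself) of how the core idioms translate,
using only constructs that appear in the diff that follows; the include path is
Krita's new wrapper around upstream <xsimd/xsimd.hpp>:

    #include <xsimd_extensions/xsimd.hpp>

    // Vc::float_v  ->  xsimd::batch<float, Arch>
    // Vc::float_m  ->  xsimd::batch_bool<float, Arch>
    using float_v = xsimd::batch<float, xsimd::current_arch>;
    using float_m = float_v::batch_bool_type;

    float_v clampOutsideRadius(float_v dist, float radius)
    {
        const float_m outside = dist > float_v(radius); // comparisons yield masks
        // Vc wrote through the mask:   dist(outside) = 1.0f;
        // xsimd expresses the same thing as a branchless select:
        return xsimd::select(outside, float_v(1.0f), dist);
    }

Write-masked assignment is the one spot where the port is not one-to-one: every
dist(mask) = expr in the old code becomes dist = xsimd::select(mask, expr, dist),
as the kis_antialiasing_fade_maker.h hunk below shows.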
M +2 -2 3rdparty/ext_xsimd/CMakeLists.txt
M +35 -30 CMakeLists.txt
M +3 -11 benchmarks/CMakeLists.txt
M +80 -60 benchmarks/kis_composition_benchmark.cpp
M +2 -0 benchmarks/kis_composition_benchmark.h
A +20 -0 cmake/modules/Findxsimd.cmake
A +105 -0 cmake/modules/xsimd/xsimdAddCompilerFlag.cmake
A +267 -0 cmake/modules/xsimd/xsimdMacros.cmake
D +0 -4 config-vc.h.cmake
A +4 -0 config-xsimd.h.cmake
M +1 -1 krita/data/aboutdata/libraries.txt
M +1 -0 libs/CMakeLists.txt
M +1 -6 libs/brush/CMakeLists.txt
M +0 -6 libs/brush/tests/CMakeLists.txt
M +2 -5 libs/image/CMakeLists.txt
M +103 -79 libs/image/kis_antialiasing_fade_maker.h
M +1 -1 libs/image/kis_base_mask_generator.cpp
M +14 -15 libs/image/kis_brush_mask_applicator_factories.cpp
M +4 -3 libs/image/kis_brush_mask_applicator_factories.h
M +14 -14 libs/image/kis_brush_mask_applicator_factories_Scalar.cpp
M +275 -264 libs/image/kis_brush_mask_processor_factories.cpp
M +1 -1 libs/image/kis_brush_mask_scalar_applicator.h
M +15 -15 libs/image/kis_brush_mask_vector_applicator.h
M +2 -2 libs/image/kis_circle_mask_generator.h
M +2 -2 libs/image/kis_curve_circle_mask_generator.h
M +2 -2 libs/image/kis_curve_rect_mask_generator.h
M +2 -2 libs/image/kis_gauss_circle_mask_generator.h
M +2 -2 libs/image/kis_gauss_rect_mask_generator.h
M +2 -2 libs/image/kis_rect_mask_generator.h
M +0 -4 libs/image/tests/CMakeLists.txt
M +26 -23 libs/image/vc_extra_math.h
A +9 -0 libs/multiarch/CMakeLists.txt
A +133 -0 libs/multiarch/xsimd_extensions/arch/xsimd_generic.hpp [License: BSD]
A +105 -0 libs/multiarch/xsimd_extensions/arch/xsimd_generic_details.hpp [License: BSD]
A +18 -0 libs/multiarch/xsimd_extensions/arch/xsimd_isa.hpp [License: BSD]
A +106 -0 libs/multiarch/xsimd_extensions/config/xsimd_arch.hpp [License: BSD]
A +47 -0 libs/multiarch/xsimd_extensions/config/xsimd_config.hpp [License: BSD]
A +18 -0 libs/multiarch/xsimd_extensions/xsimd.hpp [License: BSD]
M +2 -4 libs/pigment/CMakeLists.txt
M +32 -33 libs/pigment/KoAlphaMaskApplicator.h
M +17 -17 libs/pigment/KoAlphaMaskApplicatorFactoryImpl.cpp
M +2 -2 libs/pigment/KoAlphaMaskApplicatorFactoryImpl.h
M +2 -2 libs/pigment/KoOptimizedPixelDataScalerU8ToU16.h
M +3 -2 libs/pigment/KoOptimizedPixelDataScalerU8ToU16FactoryImpl.cpp
M +2 -2 libs/pigment/KoOptimizedPixelDataScalerU8ToU16FactoryImpl.h
A +86 -0 libs/pigment/compositeops/KoMultiArchBuildSupport.h [License: LGPL(v2.1+)]
M +43 -43 libs/pigment/compositeops/KoOptimizedCompositeOpAlphaDarken128.h
M +50 -54 libs/pigment/compositeops/KoOptimizedCompositeOpAlphaDarken32.h
M +37 -33 libs/pigment/compositeops/KoOptimizedCompositeOpCopy128.h
M +25 -32 libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.cpp
M +19 -20 libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.h
M +13 -12 libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch_Scalar.cpp
M +36 -28 libs/pigment/compositeops/KoOptimizedCompositeOpOver128.h
M +28 -57 libs/pigment/compositeops/KoOptimizedCompositeOpOver32.h
M +511 -533 libs/pigment/compositeops/KoStreamedMath.h
D +0 -122 libs/pigment/compositeops/KoVcMultiArchBuildSupport.h
M +3 -3 libs/pigment/tests/KoRgbU8ColorSpaceTester.cpp
https://invent.kde.org/graphics/krita/commit/70863966699379856e7fa08d0d8fc2aee4342c29
diff --git a/3rdparty/ext_xsimd/CMakeLists.txt b/3rdparty/ext_xsimd/CMakeLists.txt
index d0afc5704c..9c7f59354b 100644
--- a/3rdparty/ext_xsimd/CMakeLists.txt
+++ b/3rdparty/ext_xsimd/CMakeLists.txt
@@ -2,8 +2,8 @@ SET(EXTPREFIX_xsimd "${EXTPREFIX}" )
ExternalProject_Add( ext_xsimd
DOWNLOAD_DIR ${EXTERNALS_DOWNLOAD_DIR}
- URL https://github.com/xtensor-stack/xsimd/archive/8.0.5/xsimd-8.0.5.tar.gz
- URL_HASH SHA512=165ca307bcaccc226dd518562a9269aa9fc44c70fab299ea0827d7a0a9cf58ec493b381fcf2c4d9d1f6edfda8fa309643c63b17fb20d9de78a4be19eb70e30b3
+ URL https://github.com/xtensor-stack/xsimd/archive/5647254635ea6f590b7366e0848a8a0ab3007efa.tar.gz
+ URL_HASH SHA256=5b175c8b671d4287343b0e79e9e3a8070992815f5e0ed44bfc9bce01f8880814
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${EXTPREFIX_xsimd} -DCMAKE_BUILD_TYPE=${GLOBAL_BUILD_TYPE} ${GLOBAL_PROFILE}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8aa2951d81..124a53cf20 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -960,31 +960,19 @@ endif()
list (APPEND ANDROID_EXTRA_LIBS ${LCMS2_LIBRARIES})
##
-## Test for Vc
+## Test for xsimd
##
-set(OLD_CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} )
-set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/modules )
-set(HAVE_VC FALSE)
-if (NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*|riscv64.*|RISCV64.*)"
- OR CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)"))
- find_package(Vc 1.1.0)
- set_package_properties(Vc PROPERTIES
- DESCRIPTION "Portable, zero-overhead SIMD library for C++"
- URL "https://github.com/VcDevel/Vc"
- TYPE OPTIONAL
- PURPOSE "Required by the Krita for vectorization")
- macro_bool_to_01(Vc_FOUND HAVE_VC)
-endif()
-configure_file(config-vc.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config-vc.h )
-
-if(HAVE_VC)
- message(STATUS "Vc found!")
- set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH}
- "${CMAKE_SOURCE_DIR}/cmake/vc")
-
- include (VcMacros)
+find_package(xsimd 8.0.5)
+set_package_properties(xsimd PROPERTIES
+ DESCRIPTION "C++ wrappers for SIMD intrinsics"
+ URL "https://github.com/xtensor-stack/xsimd"
+ TYPE OPTIONAL
+ PURPOSE "Required by Krita for vectorization")
+macro_bool_to_01(xsimd_FOUND HAVE_XSIMD)
+configure_file(config-xsimd.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config-xsimd.h )
- if(Vc_COMPILER_IS_CLANG)
+if(HAVE_XSIMD)
+ if(xsimd_COMPILER_IS_CLANG)
set(ADDITIONAL_VC_FLAGS "-ffp-contract=fast")
if(NOT WIN32)
set(ADDITIONAL_VC_FLAGS "${ADDITIONAL_VC_FLAGS} -fPIC")
@@ -996,15 +984,32 @@ if(HAVE_VC)
endif()
endif()
- macro(ko_compile_for_all_implementations_no_scalar _objs _src)
- vc_compile_for_all_implementations(${_objs} ${_src} FLAGS ${ADDITIONAL_VC_FLAGS} ONLY SSE2 SSSE3 SSE4_1 AVX AVX2+FMA+BMI2)
- endmacro()
+ if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
+ macro(ko_compile_for_all_implementations_no_scalar _objs _src)
+ xsimd_compile_for_all_implementations(${_objs} ${_src} FLAGS ${ADDITIONAL_VC_FLAGS} ONLY NEON)
+ endmacro()
+
+ macro(ko_compile_for_all_implementations _objs _src)
+ xsimd_compile_for_all_implementations(${_objs} ${_src} FLAGS ${ADDITIONAL_VC_FLAGS} ONLY Scalar NEON)
+ endmacro()
+ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)")
+ macro(ko_compile_for_all_implementations_no_scalar _objs _src)
+ xsimd_compile_for_all_implementations(${_objs} ${_src} FLAGS ${ADDITIONAL_VC_FLAGS} ONLY NEON64)
+ endmacro()
+
+ macro(ko_compile_for_all_implementations _objs _src)
+ xsimd_compile_for_all_implementations(${_objs} ${_src} FLAGS ${ADDITIONAL_VC_FLAGS} ONLY Scalar NEON64)
+ endmacro()
+ else()
+ macro(ko_compile_for_all_implementations_no_scalar _objs _src)
+ xsimd_compile_for_all_implementations(${_objs} ${_src} FLAGS ${ADDITIONAL_VC_FLAGS} ONLY SSE2 SSSE3 SSE4_1 AVX AVX2+FMA)
+ endmacro()
- macro(ko_compile_for_all_implementations _objs _src)
- vc_compile_for_all_implementations(${_objs} ${_src} FLAGS ${ADDITIONAL_VC_FLAGS} ONLY Scalar SSE2 SSSE3 SSE4_1 AVX AVX2+FMA+BMI2)
- endmacro()
+ macro(ko_compile_for_all_implementations _objs _src)
+ xsimd_compile_for_all_implementations(${_objs} ${_src} FLAGS ${ADDITIONAL_VC_FLAGS} ONLY Scalar SSE2 SSSE3 SSE4_1 AVX AVX2+FMA)
+ endmacro()
+ endif()
endif()
-set(CMAKE_MODULE_PATH ${OLD_CMAKE_MODULE_PATH} )
##
## Test endianness
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 5be0d778ef..54b63ce3d7 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -7,14 +7,6 @@ include_directories(SYSTEM
${EIGEN3_INCLUDE_DIR}
)
-
-set(LINK_VC_LIB)
-if(HAVE_VC)
- include_directories(${Vc_INCLUDE_DIR})
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${Vc_DEFINITIONS}")
- set(LINK_VC_LIB ${Vc_LIBRARIES})
-endif()
-
macro_add_unittest_definitions()
########### next target ###############
@@ -76,9 +68,9 @@ target_link_libraries(KisLowMemoryBenchmark kritaimage Qt5::Test)
target_link_libraries(KisAnimationRenderingBenchmark kritaimage kritaui Qt5::Test)
target_link_libraries(KisFilterSelectionsBenchmark kritaimage Qt5::Test)
-target_link_libraries(KisCompositionBenchmark kritaimage Qt5::Test ${LINK_VC_LIB})
-if(HAVE_VC)
- set_property(TARGET KisCompositionBenchmark APPEND PROPERTY COMPILE_OPTIONS "${Vc_ARCHITECTURE_FLAGS}")
+target_link_libraries(KisCompositionBenchmark kritaimage Qt5::Test kritamultiarch)
+if(HAVE_XSIMD)
+ set_property(TARGET KisCompositionBenchmark APPEND PROPERTY COMPILE_OPTIONS "${xsimd_ARCHITECTURE_FLAGS}")
endif()
target_link_libraries(KisMaskGeneratorBenchmark kritaimage Qt5::Test)
target_link_libraries(KisThumbnailBenchmark kritaimage Qt5::Test)
diff --git a/benchmarks/kis_composition_benchmark.cpp b/benchmarks/kis_composition_benchmark.cpp
index 43ba80b387..a9489a641b 100644
--- a/benchmarks/kis_composition_benchmark.cpp
+++ b/benchmarks/kis_composition_benchmark.cpp
@@ -2,25 +2,14 @@
* SPDX-FileCopyrightText: 2012 Dmitry Kazakov <dimula73 at gmail.com>
* SPDX-FileCopyrightText: 2015 Thorsten Zachmann <zachmann at kde.org>
* SPDX-FileCopyrightText: 2020 Mathias Wein <lynx.mw+kde at gmail.com>
+ * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy at amyspark.me>
*
* SPDX-License-Identifier: GPL-2.0-or-later
*/
// for calculation of the needed alignment
-#include <config-vc.h>
-#ifdef HAVE_VC
-#if defined _MSC_VER
-// Lets shut up the "possible loss of data" and "forcing value to bool 'true' or 'false'
-#pragma warning ( push )
-#pragma warning ( disable : 4244 )
-#pragma warning ( disable : 4800 )
-#endif
-#include <Vc/Vc>
-#include <Vc/IO>
-#if defined _MSC_VER
-#pragma warning ( pop )
-#endif
-
+#include <xsimd_extensions/xsimd.hpp>
+#ifdef HAVE_XSIMD
#include <KoOptimizedCompositeOpOver32.h>
#include <KoOptimizedCompositeOpOver128.h>
#include <KoOptimizedCompositeOpCopy128.h>
@@ -55,6 +44,10 @@
#define MEMALIGN_FREE(p) free((p))
#endif
+#ifdef HAVE_XSIMD
+using float_v = xsimd::batch<float, xsimd::current_arch>;
+#endif
+
enum AlphaRange {
ALPHA_ZERO,
ALPHA_UNIT,
@@ -238,8 +231,8 @@ QVector<Tile> generateTiles(int size,
{
QVector<Tile> tiles(size);
-#ifdef HAVE_VC
- const int vecSize = Vc::float_v::size();
+#ifdef HAVE_XSIMD
+ const int vecSize = float_v::size;
#else
const int vecSize = 1;
#endif
@@ -512,7 +505,7 @@ void benchmarkCompositeOp(const KoCompositeOp *op, const QString &postfix)
benchmarkCompositeOp(op, false, 1.0, 1.0, 0, 0, ALPHA_UNIT, ALPHA_UNIT);
}
-#ifdef HAVE_VC
+#ifdef HAVE_XSIMD
template <typename channels_type>
void printError(quint8 *s, quint8 *d1, quint8 *d2, quint8 *msk1, int pos)
@@ -535,7 +528,7 @@ void checkRounding(qreal opacity, qreal flow, qreal averageOpacity = -1, quint32
QVector<Tile> tiles =
generateTiles(2, 0, 0, ALPHA_RANDOM, ALPHA_RANDOM, pixelSize);
- const int vecSize = Vc::float_v::size();
+ const int vecSize = float_v::size;
const int numBlocks = numPixels / vecSize;
@@ -562,7 +555,7 @@ void checkRounding(qreal opacity, qreal flow, qreal averageOpacity = -1, quint32
// The error count is needed as 38.5 gets rounded to 38 instead of 39 in the vc version.
int errorcount = 0;
for (int i = 0; i < numBlocks; i++) {
- Compositor::template compositeVector<true,true, Vc::CurrentImplementation::current()>(src1, dst1, msk1, params.opacity, paramsWrapper);
+ Compositor::template compositeVector<true,true, xsimd::current_arch>(src1, dst1, msk1, params.opacity, paramsWrapper);
for (int j = 0; j < vecSize; j++) {
//if (8 * i + j == 7080) {
@@ -571,7 +564,7 @@ void checkRounding(qreal opacity, qreal flow, qreal averageOpacity = -1, quint32
// qDebug() << "msk:" << msk2[0];
//}
- Compositor::template compositeOnePixelScalar<true, Vc::CurrentImplementation::current()>(src2, dst2, msk2, params.opacity, paramsWrapper);
+ Compositor::template compositeOnePixelScalar<true, xsimd::current_arch>(src2, dst2, msk2, params.opacity, paramsWrapper);
bool compareResult = true;
if (pixelSize == 4) {
@@ -623,107 +616,134 @@ void checkRounding(qreal opacity, qreal flow, qreal averageOpacity = -1, quint32
#endif
+void KisCompositionBenchmark::detectBuildArchitecture()
+{
+#ifdef HAVE_XSIMD
+ using namespace xsimd;
+
+ qDebug() << "built for" << ppVar(current_arch().name());
+ qDebug() << "built for" << ppVar(default_arch().name());
+
+ qDebug() << ppVar(supported_architectures().contains<sse2>());
+ qDebug() << ppVar(supported_architectures().contains<sse3>());
+ qDebug() << ppVar(supported_architectures().contains<ssse3>());
+ qDebug() << ppVar(supported_architectures().contains<sse4_1>());
+ qDebug() << ppVar(supported_architectures().contains<sse4_2>());
+ qDebug() << ppVar(supported_architectures().contains<fma3<sse4_2>>());
+
+ qDebug() << ppVar(supported_architectures().contains<avx>());
+ qDebug() << ppVar(supported_architectures().contains<avx2>());
+ qDebug() << ppVar(supported_architectures().contains<fma3<avx2>>());
+ qDebug() << ppVar(supported_architectures().contains<fma4>());
+ qDebug() << ppVar(supported_architectures().contains<avx512f>());
+ qDebug() << ppVar(supported_architectures().contains<avx512bw>());
+ qDebug() << ppVar(supported_architectures().contains<avx512dq>());
+ qDebug() << ppVar(supported_architectures().contains<avx512cd>());
+ qDebug().nospace() << "running on " << hex << "0x" << xsimd::available_architectures().best;
+#endif
+}
+
void KisCompositionBenchmark::checkRoundingAlphaDarken_05_03()
{
-#ifdef HAVE_VC
+#ifdef HAVE_XSIMD
checkRounding<AlphaDarkenCompositor32<quint8, quint32, KoAlphaDarkenParamsWrapperCreamy> >(0.5,0.3);
#endif
}
void KisCompositionBenchmark::checkRoundingAlphaDarken_05_05()
{
-#ifdef HAVE_VC
+#ifdef HAVE_XSIMD
checkRounding<AlphaDarkenCompositor32<quint8, quint32, KoAlphaDarkenParamsWrapperCreamy> >(0.5,0.5);
#endif
}
void KisCompositionBenchmark::checkRoundingAlphaDarken_05_07()
{
-#ifdef HAVE_VC
+#ifdef HAVE_XSIMD
checkRounding<AlphaDarkenCompositor32<quint8, quint32, KoAlphaDarkenParamsWrapperCreamy> >(0.5,0.7);
#endif
}
void KisCompositionBenchmark::checkRoundingAlphaDarken_05_10()
{
-#ifdef HAVE_VC
+#ifdef HAVE_XSIMD
checkRounding<AlphaDarkenCompositor32<quint8, quint32, KoAlphaDarkenParamsWrapperCreamy> >(0.5,1.0);
#endif
}
void KisCompositionBenchmark::checkRoundingAlphaDarken_05_10_08()
{
-#ifdef HAVE_VC
+#ifdef HAVE_XSIMD
checkRounding<AlphaDarkenCompositor32<quint8, quint32, KoAlphaDarkenParamsWrapperCreamy> >(0.5,1.0,0.8);
#endif
}
void KisCompositionBenchmark::checkRoundingAlphaDarkenF32_05_03()
{
-#ifdef HAVE_VC
+#ifdef HAVE_XSIMD
checkRounding<OverCompositor128<float, false, true> >(0.5, 0.3, -1, 16);
#endif
}
void KisCompositionBenchmark::checkRoundingAlphaDarkenF32_05_05()
{
-#ifdef HAVE_VC
+#ifdef HAVE_XSIMD
checkRounding<OverCompositor128<float, false, true> >(0.5, 0.5, -1, 16);
#endif
}
void KisCompositionBenchmark::checkRoundingAlphaDarkenF32_05_07()
{
-#ifdef HAVE_VC
+#ifdef HAVE_XSIMD
checkRounding<OverCompositor128<float, false, true> >(0.5, 0.7, -1, 16);
#endif
}
void KisCompositionBenchmark::checkRoundingAlphaDarkenF32_05_10()
{
-#ifdef HAVE_VC
+#ifdef HAVE_XSIMD
checkRounding<OverCompositor128<float, false, true> >(0.5, 1.0, -1, 16);
#endif
}
void KisCompositionBenchmark::checkRoundingAlphaDarkenF32_05_10_08()
{
-#ifdef HAVE_VC
+#ifdef HAVE_XSIMD
checkRounding<OverCompositor128<float, false, true> >(0.5, 1.0, 0.8, 16);
#endif
}
void KisCompositionBenchmark::checkRoundingOver()
{
-#ifdef HAVE_VC
+#ifdef HAVE_XSIMD
checkRounding<OverCompositor32<quint8, quint32, false, true> >(0.5, 0.3);
#endif
}
void KisCompositionBenchmark::checkRoundingOverRgbaU16()
{
-#ifdef HAVE_VC
+#ifdef HAVE_XSIMD
checkRounding<OverCompositor128<quint16, false, true> >(0.5, 1.0, -1, 8);
#endif
}
void KisCompositionBenchmark::checkRoundingOverRgbaF32()
{
-#ifdef HAVE_VC
+#ifdef HAVE_XSIMD
checkRounding<OverCompositor128<float, false, true> >(0.5, 1.0, -1, 16);
#endif
}
#include <cfenv>
void KisCompositionBenchmark::checkRoundingCopyRgbaU16()
{
-#ifdef HAVE_VC
+#ifdef HAVE_XSIMD
checkRounding<CopyCompositor128<quint16, false, true> >(0.5, 1.0, -1, 8);
#endif
}
void KisCompositionBenchmark::checkRoundingCopyRgbaF32()
{
-#ifdef HAVE_VC
+#ifdef HAVE_XSIMD
checkRounding<CopyCompositor128<float, false, true> >(0.5, 1.0, -1, 16);
#endif
}
@@ -1036,8 +1056,8 @@ void KisCompositionBenchmark::benchmarkMemcpy()
freeTiles(tiles, 0, 0);
}
-#ifdef HAVE_VC
- const int vecSize = Vc::float_v::size();
+#ifdef HAVE_XSIMD
+ const int vecSize = float_v::size;
const size_t uint8VecAlignment = qMax(vecSize * sizeof(quint8), sizeof(void*));
const size_t uint32VecAlignment = qMax(vecSize * sizeof(quint32), sizeof(void*));
const size_t floatVecAlignment = qMax(vecSize * sizeof(float), sizeof(void*));
@@ -1045,8 +1065,8 @@ void KisCompositionBenchmark::benchmarkMemcpy()
void KisCompositionBenchmark::benchmarkUintFloat()
{
-#ifdef HAVE_VC
- using uint_v = Vc::SimdArray<unsigned int, Vc::float_v::size()>;
+#ifdef HAVE_XSIMD
+ using uint_v = xsimd::batch<unsigned int, xsimd::current_arch>;
const int dataSize = 4096;
void *ptr = 0;
@@ -1062,11 +1082,13 @@ void KisCompositionBenchmark::benchmarkUintFloat()
float *fData = (float*)ptr;
QBENCHMARK {
- for (int i = 0; i < dataSize; i += Vc::float_v::size()) {
+ for (int i = 0; i < dataSize; i += float_v::size) {
// convert uint -> float directly, this causes
// static_cast helper be called
- Vc::float_v b(uint_v(iData + i));
- b.store(fData + i);
+ const auto b = xsimd::batch_cast<typename float_v::value_type>(
+ xsimd::load_and_extend<uint_v>(iData + i)
+ );
+ b.store_aligned(fData + i);
}
}
@@ -1077,9 +1099,8 @@ void KisCompositionBenchmark::benchmarkUintFloat()
void KisCompositionBenchmark::benchmarkUintIntFloat()
{
-#ifdef HAVE_VC
- using int_v = Vc::SimdArray<int, Vc::float_v::size()>;
- using uint_v = Vc::SimdArray<unsigned int, Vc::float_v::size()>;
+#ifdef HAVE_XSIMD
+ using uint_v = xsimd::batch<unsigned int, xsimd::current_arch>;
const int dataSize = 4096;
void *ptr = 0;
@@ -1095,11 +1116,11 @@ void KisCompositionBenchmark::benchmarkUintIntFloat()
float *fData = (float*)ptr;
QBENCHMARK {
- for (int i = 0; i < dataSize; i += Vc::float_v::size()) {
+ for (int i = 0; i < dataSize; i += float_v::size) {
// convert uint->int->float, that avoids special sign
// treating, and gives 2.6 times speedup
- Vc::float_v b(int_v(uint_v(iData + i)));
- b.store(fData + i);
+ const auto b = xsimd::batch_cast<typename float_v::value_type>(xsimd::load_and_extend<uint_v>(iData + i));
+ b.store_aligned(fData + i);
}
}
@@ -1110,8 +1131,8 @@ void KisCompositionBenchmark::benchmarkUintIntFloat()
void KisCompositionBenchmark::benchmarkFloatUint()
{
-#ifdef HAVE_VC
- using uint_v = Vc::SimdArray<unsigned int, Vc::float_v::size()>;
+#ifdef HAVE_XSIMD
+ using uint_v = xsimd::batch<unsigned int, xsimd::current_arch>;
const int dataSize = 4096;
void *ptr = 0;
@@ -1127,11 +1148,12 @@ void KisCompositionBenchmark::benchmarkFloatUint()
float *fData = (float*)ptr;
QBENCHMARK {
- for (int i = 0; i < dataSize; i += Vc::float_v::size()) {
+ for (int i = 0; i < dataSize; i += float_v::size) {
// conversion float -> uint
- uint_v b(Vc::float_v(fData + i));
+ // this being a direct conversion, load_and_extend does not apply
+ const auto b = xsimd::batch_cast<typename uint_v::value_type>(float_v::load_aligned(fData + i));
- b.store(iData + i);
+ b.store_aligned(iData + i);
}
}
@@ -1142,10 +1164,8 @@ void KisCompositionBenchmark::benchmarkFloatUint()
void KisCompositionBenchmark::benchmarkFloatIntUint()
{
-#ifdef HAVE_VC
- using int_v = Vc::SimdArray<int, Vc::float_v::size()>;
- using uint_v = Vc::SimdArray<unsigned int, Vc::float_v::size()>;
-
+#ifdef HAVE_XSIMD
+ using uint_v = xsimd::batch<unsigned int, xsimd::current_arch>;
const int dataSize = 4096;
void *ptr = 0;
int error = MEMALIGN_ALLOC(&ptr, uint32VecAlignment, dataSize * sizeof(quint32));
@@ -1160,11 +1180,11 @@ void KisCompositionBenchmark::benchmarkFloatIntUint()
float *fData = (float*)ptr;
QBENCHMARK {
- for (int i = 0; i < dataSize; i += Vc::float_v::size()) {
+ for (int i = 0; i < dataSize; i += float_v::size) {
// conversion float -> int -> uint
- uint_v b(int_v(Vc::float_v(fData + i)));
+ const auto b = xsimd::batch_cast<typename uint_v::value_type>(float_v::load_aligned(fData + i));
- b.store(iData + i);
+ b.store_aligned(iData + i);
}
}
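
The four conversion benchmarks above replace Vc's converting constructors
(uint_v -> float_v and back) with explicit loads plus element-wise batch_cast.
A condensed sketch of both directions; note that xsimd::load_and_extend is one
of the gather/extension helpers this commit adds under xsimd_extensions, not an
upstream xsimd function, and the invented wrappers below assume pointers aligned
as by MEMALIGN_ALLOC:

    #include <xsimd_extensions/xsimd.hpp>

    using float_v = xsimd::batch<float, xsimd::current_arch>;
    using uint_v = xsimd::batch<unsigned int, xsimd::current_arch>;

    // uint -> float: load a lane-matched integer batch, then cast element-wise.
    void uintToFloat(const unsigned int *src, float *dst)
    {
        const auto b = xsimd::batch_cast<float>(xsimd::load_and_extend<uint_v>(src));
        b.store_aligned(dst);
    }

    // float -> uint: a direct conversion, so a plain aligned load suffices.
    void floatToUint(const float *src, unsigned int *dst)
    {
        const auto b = xsimd::batch_cast<unsigned int>(float_v::load_aligned(src));
        b.store_aligned(dst);
    }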
diff --git a/benchmarks/kis_composition_benchmark.h b/benchmarks/kis_composition_benchmark.h
index 1924aa310e..021f18646c 100644
--- a/benchmarks/kis_composition_benchmark.h
+++ b/benchmarks/kis_composition_benchmark.h
@@ -13,6 +13,8 @@ class KisCompositionBenchmark : public QObject
{
Q_OBJECT
private Q_SLOTS:
+ void detectBuildArchitecture();
+
void checkRoundingAlphaDarken_05_03();
void checkRoundingAlphaDarken_05_05();
void checkRoundingAlphaDarken_05_07();
diff --git a/cmake/modules/Findxsimd.cmake b/cmake/modules/Findxsimd.cmake
new file mode 100644
index 0000000000..0f93bf7e4e
--- /dev/null
+++ b/cmake/modules/Findxsimd.cmake
@@ -0,0 +1,20 @@
+# SPDX-FileCopyrightText: 2022 L. E. Segovia <amy at amyspark.me>
+# SPDX-License-Identifier: BSD-3-Clause
+
+include(FindPackageHandleStandardArgs)
+
+find_package(PkgConfig QUIET)
+pkg_check_modules(PC_xsimd QUIET xsimd)
+
+find_package(xsimd QUIET NO_MODULE
+ HINTS ${PC_xsimd_CONFIG_DIR} /usr/lib/cmake/xsimd /usr/local/lib/cmake/xsimd
+)
+
+if(xsimd_FOUND)
+ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/xsimd")
+ include(xsimdMacros)
+ xsimd_set_preferred_compiler_flags()
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(xsimd CONFIG_MODE REQUIRED_VARS xsimd_IS_CONFIGURATION_VALID)
diff --git a/cmake/modules/xsimd/xsimdAddCompilerFlag.cmake b/cmake/modules/xsimd/xsimdAddCompilerFlag.cmake
new file mode 100644
index 0000000000..c79d3e0500
--- /dev/null
+++ b/cmake/modules/xsimd/xsimdAddCompilerFlag.cmake
@@ -0,0 +1,105 @@
+# - Add a given compiler flag to flags variables.
+# AddCompilerFlag(<flag> [<var>])
+# or
+# AddCompilerFlag(<flag> [C_FLAGS <var>] [CXX_FLAGS <var>] [C_RESULT <var>]
+# [CXX_RESULT <var>])
+
+#=============================================================================
+# SPDX-FileCopyrightText: 2010-2015 Matthias Kretz <kretz at kde.org>
+# SPDX-FileCopyrightText: 2021 L. E. Segovia <amy at amyspark.me>
+# SPDX-License-Identifier: BSD-3-Clause
+#=============================================================================
+
+include(CheckCCompilerFlag)
+include(CheckCXXCompilerFlag)
+
+macro(AddCompilerFlag _flag)
+ string(REGEX REPLACE "[-.+/:= ]" "_" _flag_esc "${_flag}")
+
+ set(_c_flags "CMAKE_C_FLAGS")
+ set(_cxx_flags "CMAKE_CXX_FLAGS")
+ set(_c_result tmp)
+ set(_cxx_result tmp)
+ if(${ARGC} EQUAL 2)
+ message(WARNING "Deprecated use of the AddCompilerFlag macro.")
+ unset(_c_result)
+ set(_cxx_result ${ARGV1})
+ elseif(${ARGC} GREATER 2)
+ set(state 0)
+ unset(_c_flags)
+ unset(_cxx_flags)
+ unset(_c_result)
+ unset(_cxx_result)
+ foreach(_arg ${ARGN})
+ if("x${_arg}" STREQUAL "xC_FLAGS")
+ set(state 1)
+ if(NOT DEFINED _c_result)
+ set(_c_result tmp0)
+ endif()
+ elseif("x${_arg}" STREQUAL "xCXX_FLAGS")
+ set(state 2)
+ if(NOT DEFINED _cxx_result)
+ set(_cxx_result tmp1)
+ endif()
+ elseif("x${_arg}" STREQUAL "xC_RESULT")
+ set(state 3)
+ elseif("x${_arg}" STREQUAL "xCXX_RESULT")
+ set(state 4)
+ elseif(state EQUAL 1)
+ set(_c_flags "${_arg}")
+ elseif(state EQUAL 2)
+ set(_cxx_flags "${_arg}")
+ elseif(state EQUAL 3)
+ set(_c_result "${_arg}")
+ elseif(state EQUAL 4)
+ set(_cxx_result "${_arg}")
+ else()
+ message(FATAL_ERROR "Syntax error for AddCompilerFlag")
+ endif()
+ endforeach()
+ endif()
+
+ set(_c_code "int main() { return 0; }")
+ set(_cxx_code "int main() { return 0; }")
+ if("${_flag}" STREQUAL "-mfma")
+ # Compiling with FMA3 support may fail only at the assembler level.
+ # In that case we need to have such an instruction in the test code
+ set(_c_code "#include <immintrin.h>
+ __m128 foo(__m128 x) { return _mm_fmadd_ps(x, x, x); }
+ int main() { return 0; }")
+ set(_cxx_code "${_c_code}")
+ elseif("${_flag}" STREQUAL "-stdlib=libc++")
+ # Compiling with libc++ not only requires a compiler that understands it, but also
+ # the libc++ headers itself
+ set(_cxx_code "#include <iostream>
+ #include <cstdio>
+ int main() { return 0; }")
+ else()
+ set(_cxx_code "#include <cstdio>
+ int main() { return 0; }")
+ endif()
+
+ if(DEFINED _c_result)
+ check_c_compiler_flag("${_flag}" check_c_compiler_flag_${_flag_esc} "${_c_code}")
+ set(${_c_result} ${check_c_compiler_flag_${_flag_esc}})
+ endif()
+ if(DEFINED _cxx_result)
+ check_cxx_compiler_flag("${_flag}" check_cxx_compiler_flag_${_flag_esc} "${_cxx_code}")
+ set(${_cxx_result} ${check_cxx_compiler_flag_${_flag_esc}})
+ endif()
+
+ macro(my_append _list _flag _special)
+ if("x${_list}" STREQUAL "x${_special}")
+ set(${_list} "${${_list}} ${_flag}")
+ else()
+ list(APPEND ${_list} "${_flag}")
+ endif()
+ endmacro()
+
+ if(check_c_compiler_flag_${_flag_esc} AND DEFINED _c_flags)
+ my_append(${_c_flags} "${_flag}" CMAKE_C_FLAGS)
+ endif()
+ if(check_cxx_compiler_flag_${_flag_esc} AND DEFINED _cxx_flags)
+ my_append(${_cxx_flags} "${_flag}" CMAKE_CXX_FLAGS)
+ endif()
+endmacro(AddCompilerFlag)
diff --git a/cmake/modules/xsimd/xsimdMacros.cmake b/cmake/modules/xsimd/xsimdMacros.cmake
new file mode 100644
index 0000000000..b493b7a421
--- /dev/null
+++ b/cmake/modules/xsimd/xsimdMacros.cmake
@@ -0,0 +1,267 @@
+# Macros for use with xsimd <https://github.com/xtensor-stack/xsimd>
+#
+# The following macros are provided:
+# xsimd_determine_compiler
+# xsimd_set_preferred_compiler_flags
+# xsimd_compile_for_all_implementations
+#
+#=============================================================================
+# SPDX-FileCopyrightText: 2010-2015 Matthias Kretz <kretz at kde.org>
+# SPDX-FileCopyrightText: 2022 L. E. Segovia <amy at amyspark.me>
+# SPDX-License-Identifier: BSD-3-Clause
+#=============================================================================
+
+cmake_minimum_required(VERSION 3.12.0)
+
+include ("${CMAKE_CURRENT_LIST_DIR}/xsimdAddCompilerFlag.cmake")
+
+set(xsimd_IS_CONFIGURATION_VALID TRUE)
+mark_as_advanced(xsimd_IS_CONFIGURATION_VALID)
+
+macro(xsimd_determine_compiler)
+ set(xsimd_COMPILER_IS_CLANG false)
+ set(xsimd_COMPILER_IS_MSVC false)
+ set(xsimd_COMPILER_IS_GCC false)
+ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ set(xsimd_COMPILER_IS_CLANG true)
+ message(STATUS "Detected Compiler: Clang ${CMAKE_CXX_COMPILER_VERSION}")
+
+ # break build with too old clang as early as possible.
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.0)
+ message(WARNING "xsimd requires at least clang 4.0")
+ set(xsimd_IS_CONFIGURATION_VALID FALSE)
+ endif()
+ elseif(MSVC)
+ set(xsimd_COMPILER_IS_MSVC true)
+ message(STATUS "Detected Compiler: MSVC ${MSVC_VERSION}")
+ # version detection of 2015 update 2 must be done against _MSC_FULL_VER == 190023918
+ file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/msvc_version.c" "MSVC _MSC_FULL_VER")
+ execute_process(COMMAND ${CMAKE_CXX_COMPILER} /nologo -EP "${CMAKE_CURRENT_BINARY_DIR}/msvc_version.c" OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE xsimd_MSVC_VERSION)
+ string(STRIP "${xsimd_MSVC_VERSION}" xsimd_MSVC_VERSION)
+ string(REPLACE "MSVC " "" xsimd_MSVC_VERSION "${xsimd_MSVC_VERSION}")
+ if (MSVC_VERSION LESS 1900 OR xsimd_MSVC_VERSION LESS 190023918)
+ message(WARNING "xsimd requires at least MSVC 2015 Update 2")
+ set(xsimd_IS_CONFIGURATION_VALID FALSE)
+ endif()
+ elseif(CMAKE_COMPILER_IS_GNUCXX)
+ set(xsimd_COMPILER_IS_GCC true)
+ message(STATUS "Detected Compiler: GCC ${CMAKE_CXX_COMPILER_VERSION}")
+
+ # break build with too old GCC as early as possible.
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9)
+ message(WARNING "xsimd requires at least GCC 4.9")
+ set(xsimd_IS_CONFIGURATION_VALID FALSE)
+ endif()
+ else()
+ message(WARNING "Untested/-supported Compiler (${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_VERSION}) for use with xsimd.\nPlease fill out the missing parts in the CMake scripts and submit a patch to https://invent.kde.org/graphics/krita")
+ endif()
+endmacro()
+
+macro(xsimd_check_assembler)
+ exec_program(${CMAKE_CXX_COMPILER} ARGS -print-prog-name=as OUTPUT_VARIABLE _as)
+ mark_as_advanced(_as)
+ if(NOT _as)
+ message(WARNING "Could not find 'as', the assembler used by GCC. Hoping everything will work out...")
+ else()
+ exec_program(${_as} ARGS --version OUTPUT_VARIABLE _as_version)
+ string(REGEX REPLACE "\\([^\\)]*\\)" "" _as_version "${_as_version}")
+ string(REGEX MATCH "[1-9]\\.[0-9]+(\\.[0-9]+)?" _as_version "${_as_version}")
+ if(_as_version VERSION_LESS "2.21.0")
+ message(WARNING "Your binutils is too old (${_as_version}) for reliably compiling xsimd.")
+ set(xsimd_IS_CONFIGURATION_VALID FALSE)
+ endif()
+ endif()
+endmacro()
+
+macro(xsimd_set_preferred_compiler_flags)
+ xsimd_determine_compiler()
+
+ if (NOT xsimd_COMPILER_IS_MSVC)
+ xsimd_check_assembler()
+ endif()
+
+ if(xsimd_COMPILER_IS_GCC)
+ AddCompilerFlag("-Wabi" CXX_FLAGS xsimd_ARCHITECTURE_FLAGS)
+ AddCompilerFlag("-fabi-version=0" CXX_FLAGS xsimd_ARCHITECTURE_FLAGS) # ABI version 4 is required to make __m128 and __m256 appear as different types. 0 should give us the latest version.
+ AddCompilerFlag("-fabi-compat-version=0" CXX_FLAGS xsimd_ARCHITECTURE_FLAGS) # GCC 5 introduced this switch
+ # and defaults it to 2 if -fabi-version is 0. But in that case the bug -fabi-version=0 is
+ # supposed to fix resurfaces. For now just make sure that it compiles and links.
+ elseif(xsimd_COMPILER_IS_MSVC)
+ AddCompilerFlag("/bigobj" CXX_FLAGS xsimd_ARCHITECTURE_FLAGS) # required for building tests with AVX
+ elseif(xsimd_COMPILER_IS_CLANG)
+ # disable these warnings because clang shows them for function overloads that were discarded via SFINAE
+ AddCompilerFlag("-Wno-local-type-template-args" CXX_FLAGS xsimd_ARCHITECTURE_FLAGS)
+ AddCompilerFlag("-Wno-unnamed-type-template-args" CXX_FLAGS xsimd_ARCHITECTURE_FLAGS)
+ endif()
+
+ if(xsimd_COMPILER_IS_MSVC)
+ AddCompilerFlag("/fp:fast" CXX_FLAGS xsimd_ARCHITECTURE_FLAGS)
+ else()
+ AddCompilerFlag("-ffp-contract=fast" CXX_FLAGS xsimd_ARCHITECTURE_FLAGS)
+
+ if (NOT WIN32)
+ AddCompilerFlag("-fPIC" CXX_FLAGS xsimd_ARCHITECTURE_FLAGS)
+ endif()
+ endif()
+endmacro()
+
+# helper macro for xsimd_compile_for_all_implementations
+macro(_xsimd_compile_one_implementation _srcs _impl)
+ list(FIND _disabled_targets "${_impl}" _disabled_index)
+ list(FIND _only_targets "${_impl}" _only_index)
+ if(${_disabled_index} GREATER -1)
+ if(${_only_index} GREATER -1)
+ # disabled and enabled -> error
+ message(FATAL_ERROR "xsimd_compile_for_all_implementations lists ${_impl} in both the ONLY and EXCLUDE lists. Please remove one.")
+ endif()
+ list(REMOVE_AT _disabled_targets ${_disabled_index})
+ # skip the rest and return
+ elseif((NOT _only_targets AND NOT _state EQUAL 3) OR ${_only_index} GREATER -1)
+ if(${_only_index} GREATER -1)
+ list(REMOVE_AT _only_targets ${_only_index})
+ endif()
+ set(_extra_flags)
+ set(_ok FALSE)
+ foreach(_flags_it ${ARGN})
+ if(_flags_it STREQUAL "NO_FLAG")
+ set(_ok TRUE)
+ break()
+ endif()
+ string(REPLACE " " ";" _flag_list "${_flags_it}")
+ foreach(_f ${_flag_list})
+ AddCompilerFlag(${_f} CXX_RESULT _ok)
+ if(NOT _ok)
+ break()
+ endif()
+ endforeach()
+ if(_ok)
+ set(_extra_flags ${_flags_it})
+ break()
+ endif()
+ endforeach()
+
+ if(MSVC)
+ # MSVC for 64bit does not recognize /arch:SSE2 anymore. Therefore we override _ok if _impl
+ # says SSE
+ if("${_impl}" MATCHES "SSE")
+ set(_ok TRUE)
+ endif()
+ endif()
+
+ if(_ok)
+ get_filename_component(_out "${__compile_src}" NAME_WE)
+ get_filename_component(_ext "${__compile_src}" EXT)
+ set(_out "${CMAKE_CURRENT_BINARY_DIR}/${_out}_${_impl}${_ext}")
+ add_custom_command(OUTPUT "${_out}"
+ COMMAND ${CMAKE_COMMAND} -E copy "${__compile_src}" "${_out}"
+ DEPENDS "${__compile_src}"
+ COMMENT "Copy to ${_out}"
+ WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+ VERBATIM)
+ set_source_files_properties( "${_out}" PROPERTIES
+ COMPILE_FLAGS "${_flags} ${_extra_flags}"
+ COMPILE_DEFINITIONS "XSIMD_IMPL=${_impl}"
+ )
+ list(APPEND ${_srcs} "${_out}")
+ endif()
+ endif()
+endmacro()
+
+# Generate compile rules for the given C++ source file for all available implementations and return
+# the resulting list of object files in _obj
+# all remaining arguments are additional flags
+# Example:
+# xsimd_compile_for_all_implementations(_objs src/trigonometric.cpp FLAGS -DCOMPILE_BLAH EXCLUDE Scalar)
+# add_executable(executable main.cpp ${_objs})
+macro(xsimd_compile_for_all_implementations _srcs _src)
+ set(_flags)
+ unset(_disabled_targets)
+ unset(_only_targets)
+ set(_state 0)
+ foreach(_arg ${ARGN})
+ if(_arg STREQUAL "FLAGS")
+ set(_state 1)
+ elseif(_arg STREQUAL "EXCLUDE")
+ set(_state 2)
+ elseif(_arg STREQUAL "ONLY")
+ set(_state 3)
+ elseif(_state EQUAL 1)
+ set(_flags "${_flags} ${_arg}")
+ elseif(_state EQUAL 2)
+ list(APPEND _disabled_targets "${_arg}")
+ elseif(_state EQUAL 3)
+ list(APPEND _only_targets "${_arg}")
+ else()
+ message(FATAL_ERROR "incorrect argument to xsimd_compile_for_all_implementations")
+ endif()
+ endforeach()
+
+ set(__compile_src "${_src}")
+
+ ## Note the following settings of default_arch on GCC:
+ ## - fma3<sse> should be -msse -mfma but == fma3<avx>
+ ## - fma3<avx(2)> are -mavx(2) -mfma
+ ## - fma4 should be -mfma4 but == avx
+ ##
+ ## On MSVC:
+ ## - /arch:AVX512 enables all the 512 tandem
+ ##
+ ## To target the individual architectures, it must be
+ ## done manually or via a special definition header.
+
+ ## Note the following for Arm:
+ ## MSVC requires manual patching to detect NEON,
+ ## its intrinsics are available but they are not detectable.
+
+ if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
+ _xsimd_compile_one_implementation(${_srcs} Scalar
+ NO_FLAG)
+ _xsimd_compile_one_implementation(${_srcs} NEON
+ "-mfpu=neon")
+ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)")
+ _xsimd_compile_one_implementation(${_srcs} Scalar
+ NO_FLAG)
+ _xsimd_compile_one_implementation(${_srcs} NEON64 NO_FLAG)
+ else()
+ _xsimd_compile_one_implementation(${_srcs} Scalar
+ NO_FLAG)
+ _xsimd_compile_one_implementation(${_srcs} SSE2
+ "-msse2" "/arch:SSE2")
+ _xsimd_compile_one_implementation(${_srcs} SSE3
+ "-msse3" "/arch:SSE2")
+ _xsimd_compile_one_implementation(${_srcs} SSSE3
+ "-mssse3" "/arch:SSE2")
+ _xsimd_compile_one_implementation(${_srcs} SSE4_1
+ "-msse4.1" "/arch:SSE2")
+ _xsimd_compile_one_implementation(${_srcs} SSE4_2
+ "-msse4.2" "/arch:SSE2")
+ _xsimd_compile_one_implementation(${_srcs} SSE4_2+FMA
+ "-msse4.2 -mfma" "/arch:AVX")
+ _xsimd_compile_one_implementation(${_srcs} FMA4
+ "-mfma4" "/arch:AVX")
+ _xsimd_compile_one_implementation(${_srcs} AVX
+ "-mavx" "/arch:AVX")
+ _xsimd_compile_one_implementation(${_srcs} AVX+FMA
+ "-mavx -mfma" "/arch:AVX")
+ _xsimd_compile_one_implementation(${_srcs} AVX2
+ "-mavx2" "/arch:AVX2")
+ _xsimd_compile_one_implementation(${_srcs} AVX2+FMA
+ "-mavx2 -mfma" "/arch:AVX2")
+ _xsimd_compile_one_implementation(${_srcs} AVX512F
+ "-mavx512f" "/arch:AVX512")
+ _xsimd_compile_one_implementation(${_srcs} AVX512BW
+ "-mavx512bw" "/arch:AVX512")
+ _xsimd_compile_one_implementation(${_srcs} AVX512CD
+ "-mavx512cd" "/arch:AVX512")
+ _xsimd_compile_one_implementation(${_srcs} AVX512DQ
+ "-mavx512dq" "/arch:AVX512")
+ endif()
+ list(LENGTH _only_targets _len)
+ if(_len GREATER 0)
+ message(WARNING "The following unknown targets where listed in the ONLY list of xsimd_compile_for_all_implementations: '${_only_targets}'")
+ endif()
+ list(LENGTH _disabled_targets _len)
+ if(_len GREATER 0)
+ message(WARNING "The following unknown targets where listed in the EXCLUDE list of xsimd_compile_for_all_implementations: '${_disabled_targets}'")
+ endif()
+endmacro()
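
The net effect of xsimd_compile_for_all_implementations is that each listed
source file is copied once per ISA and compiled with that ISA's flags (plus an
XSIMD_IMPL define), so xsimd::current_arch resolves to a different architecture
in each copy. A hypothetical per-arch translation unit mirroring the
template-specialization pattern used by the Krita sources further down
(scaleRow is an invented name; n is assumed to be a multiple of the batch size):

    #include <cstddef>
    #include <xsimd_extensions/xsimd.hpp>

    template<typename Arch>
    void scaleRow(float *data, std::size_t n, float k); // declared once in a shared header

    // Compiled repeatedly: the copy built with -msse2 defines scaleRow<xsimd::sse2>,
    // the copy built with -mavx2 -mfma defines the AVX2+FMA variant, and so on.
    template<>
    void scaleRow<xsimd::current_arch>(float *data, std::size_t n, float k)
    {
        using float_v = xsimd::batch<float, xsimd::current_arch>;
        const float_v vk(k);
        for (std::size_t i = 0; i < n; i += float_v::size) {
            (float_v::load_aligned(data + i) * vk).store_aligned(data + i);
        }
    }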
diff --git a/config-vc.h.cmake b/config-vc.h.cmake
deleted file mode 100644
index 489dd7d888..0000000000
--- a/config-vc.h.cmake
+++ /dev/null
@@ -1,4 +0,0 @@
-/* config-vc.h. Generated by cmake from config-Vc.h.cmake */
-
-/* Define if you have Vc, the vectorization library */
-#cmakedefine HAVE_VC 1
diff --git a/config-xsimd.h.cmake b/config-xsimd.h.cmake
new file mode 100644
index 0000000000..e4bbda03ea
--- /dev/null
+++ b/config-xsimd.h.cmake
@@ -0,0 +1,4 @@
+/* config-xsimd.h. Generated by cmake from config-xsimd.h.cmake */
+
+/* Define if you have xsimd */
+#cmakedefine HAVE_XSIMD 1
diff --git a/krita/data/aboutdata/libraries.txt b/krita/data/aboutdata/libraries.txt
index b5e1d55599..89bc6ae54b 100644
--- a/krita/data/aboutdata/libraries.txt
+++ b/krita/data/aboutdata/libraries.txt
@@ -29,5 +29,5 @@ Qt,https://www.qt.io,GPLv2 + GPLv3 + LGPLv2.1 + LGPLv3
Quazip,https://github.com/stachenov/quazip,LGPLv2.1
KSeExpr,https://invent.kde.org/graphics/KSeExpr,Apache License Version 2.0
SIP,https://www.riverbankcomputing.com/software/sip/download,GPLv3
-Vc,https://github.com/VcDevel/Vc,BSD
+xsimd,https://github.com/xtensor-stack/xsimd,BSD
zlib,http://www.zlib.net/,BSD
diff --git a/libs/CMakeLists.txt b/libs/CMakeLists.txt
index fd6fe8afda..2061c76023 100644
--- a/libs/CMakeLists.txt
+++ b/libs/CMakeLists.txt
@@ -22,3 +22,4 @@ add_subdirectory( resources )
add_subdirectory( metadata )
add_subdirectory( resourcewidgets )
add_subdirectory( psd )
+add_subdirectory( multiarch )
diff --git a/libs/brush/CMakeLists.txt b/libs/brush/CMakeLists.txt
index 88050ade93..3b71ef8c26 100644
--- a/libs/brush/CMakeLists.txt
+++ b/libs/brush/CMakeLists.txt
@@ -36,12 +36,7 @@ else ()
target_link_libraries(kritalibbrush kritaimage Qt5::Svg)
endif ()
-
-if(HAVE_VC)
- include_directories(SYSTEM ${Vc_INCLUDE_DIR})
- target_link_libraries(kritalibbrush ${Vc_LIBRARIES})
-# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${Vc_DEFINITIONS}")
-endif()
+target_link_libraries(kritalibbrush kritamultiarch)
set_target_properties(kritalibbrush PROPERTIES
VERSION ${GENERIC_KRITA_LIB_VERSION} SOVERSION ${GENERIC_KRITA_LIB_SOVERSION}
diff --git a/libs/brush/tests/CMakeLists.txt b/libs/brush/tests/CMakeLists.txt
index 425015d3eb..49951d6a70 100644
--- a/libs/brush/tests/CMakeLists.txt
+++ b/libs/brush/tests/CMakeLists.txt
@@ -8,12 +8,6 @@ include_directories(SYSTEM
${EIGEN3_INCLUDE_DIR}
)
-if(HAVE_VC)
- include_directories(SYSTEM
- ${Vc_INCLUDE_DIR}
- )
-endif()
-
macro_add_unittest_definitions()
include(ECMAddTests)
diff --git a/libs/image/CMakeLists.txt b/libs/image/CMakeLists.txt
index 35a260f86f..08eae957b2 100644
--- a/libs/image/CMakeLists.txt
+++ b/libs/image/CMakeLists.txt
@@ -25,8 +25,7 @@ if(FFTW3_FOUND)
include_directories(${FFTW3_INCLUDE_DIR})
endif()
-if(HAVE_VC)
- include_directories(SYSTEM ${Vc_INCLUDE_DIR} ${Qt5Core_INCLUDE_DIRS} ${Qt5Gui_INCLUDE_DIRS})
+if(HAVE_XSIMD)
ko_compile_for_all_implementations_no_scalar(__per_arch_circle_mask_generator_objs kis_brush_mask_applicator_factories.cpp)
ko_compile_for_all_implementations_no_scalar(_per_arch_processor_objs kis_brush_mask_processor_factories.cpp)
@@ -391,9 +390,7 @@ if(FFTW3_FOUND)
target_link_libraries(kritaimage PRIVATE ${FFTW3_LIBRARIES})
endif()
-if(HAVE_VC)
- target_link_libraries(kritaimage PUBLIC ${Vc_LIBRARIES})
-endif()
+target_link_libraries(kritaimage PUBLIC kritamultiarch)
if (NOT GSL_FOUND)
message (WARNING "KRITA WARNING! No GNU Scientific Library was found! Krita's Shaped Gradients might be non-normalized! Please install GSL library.")
diff --git a/libs/image/kis_antialiasing_fade_maker.h b/libs/image/kis_antialiasing_fade_maker.h
index e8bb3c887e..e25cf07e1c 100644
--- a/libs/image/kis_antialiasing_fade_maker.h
+++ b/libs/image/kis_antialiasing_fade_maker.h
@@ -1,5 +1,6 @@
/*
* SPDX-FileCopyrightText: 2014 Dmitry Kazakov <dimula73 at gmail.com>
+ * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy at amyspark.me>
*
* SPDX-License-Identifier: GPL-2.0-or-later
*/
@@ -7,33 +8,36 @@
#ifndef __KIS_ANTIALIASING_FADE_MAKER_H
#define __KIS_ANTIALIASING_FADE_MAKER_H
-#include "kis_global.h"
+#include <kis_global.h>
-template <class BaseFade>
+#include <xsimd_extensions/xsimd.hpp>
+
+template<class BaseFade>
class KisAntialiasingFadeMaker1D
{
public:
KisAntialiasingFadeMaker1D(const BaseFade &baseFade, bool enableAntialiasing)
- : m_radius(0.0),
- m_fadeStartValue(0),
- m_antialiasingFadeStart(0),
- m_antialiasingFadeCoeff(0),
- m_enableAntialiasing(enableAntialiasing),
- m_baseFade(baseFade)
+ : m_radius(0.0)
+ , m_fadeStartValue(0)
+ , m_antialiasingFadeStart(0)
+ , m_antialiasingFadeCoeff(0)
+ , m_enableAntialiasing(enableAntialiasing)
+ , m_baseFade(baseFade)
{
}
KisAntialiasingFadeMaker1D(const KisAntialiasingFadeMaker1D &rhs, const BaseFade &baseFade)
- : m_radius(rhs.m_radius),
- m_fadeStartValue(rhs.m_fadeStartValue),
- m_antialiasingFadeStart(rhs.m_antialiasingFadeStart),
- m_antialiasingFadeCoeff(rhs.m_antialiasingFadeCoeff),
- m_enableAntialiasing(rhs.m_enableAntialiasing),
- m_baseFade(baseFade)
+ : m_radius(rhs.m_radius)
+ , m_fadeStartValue(rhs.m_fadeStartValue)
+ , m_antialiasingFadeStart(rhs.m_antialiasingFadeStart)
+ , m_antialiasingFadeCoeff(rhs.m_antialiasingFadeCoeff)
+ , m_enableAntialiasing(rhs.m_enableAntialiasing)
+ , m_baseFade(baseFade)
{
}
- void setSquareNormCoeffs(qreal xcoeff, qreal ycoeff) {
+ void setSquareNormCoeffs(qreal xcoeff, qreal ycoeff)
+ {
m_radius = 1.0;
qreal xf = qMax(0.0, ((1.0 / xcoeff) - 1.0) * xcoeff);
@@ -45,7 +49,8 @@ public:
m_antialiasingFadeCoeff = qMax(0.0, 255.0 - m_fadeStartValue) / (m_radius - m_antialiasingFadeStart);
}
- void setRadius(qreal radius) {
+ void setRadius(qreal radius)
+ {
m_radius = radius;
m_antialiasingFadeStart = qMax(0.0, m_radius - 1.0);
@@ -53,7 +58,8 @@ public:
m_antialiasingFadeCoeff = qMax(0.0, 255.0 - m_fadeStartValue) / (m_radius - m_antialiasingFadeStart);
}
- inline bool needFade(qreal dist, quint8 *value) {
+ inline bool needFade(qreal dist, quint8 *value)
+ {
if (dist > m_radius) {
*value = 255;
return true;
@@ -71,30 +77,36 @@ public:
return false;
}
-#if defined HAVE_VC
- Vc::float_m needFade(Vc::float_v &dist) {
- const Vc::float_v vOne(Vc::One);
- const Vc::float_v vValMax(255.f);
+#if defined HAVE_XSIMD
+ template<typename A>
+ xsimd::batch_bool<float, A> needFade(xsimd::batch<float, A> &dist)
+ {
+ using float_v = xsimd::batch<float, A>;
+ using float_m = typename float_v::batch_bool_type;
+
+ const float_v vOne(1);
+ const float_v vValMax(255.f);
- Vc::float_v vRadius(m_radius);
- Vc::float_v vFadeStartValue(m_fadeStartValue);
- Vc::float_v vAntialiasingFadeStart(m_antialiasingFadeStart);
- Vc::float_v vAntialiasingFadeCoeff(m_antialiasingFadeCoeff);
+ float_v vRadius(m_radius);
+ float_v vFadeStartValue(m_fadeStartValue);
+ float_v vAntialiasingFadeStart(m_antialiasingFadeStart);
+ float_v vAntialiasingFadeCoeff(m_antialiasingFadeCoeff);
- Vc::float_m outsideMask = dist > vRadius;
- dist(outsideMask) = vOne;
+ float_m outsideMask = dist > vRadius;
+ dist = xsimd::set_one(dist, outsideMask);
- Vc::float_m fadeStartMask(false);
+ float_m fadeStartMask(false);
- if(m_enableAntialiasing){
+ if (m_enableAntialiasing) {
fadeStartMask = dist > vAntialiasingFadeStart;
- dist((outsideMask ^ fadeStartMask) & fadeStartMask) = (vFadeStartValue +
- (dist - vAntialiasingFadeStart) * vAntialiasingFadeCoeff) / vValMax;
+ dist = xsimd::select((outsideMask ^ fadeStartMask) & fadeStartMask,
+ (vFadeStartValue + (dist - vAntialiasingFadeStart) * vAntialiasingFadeCoeff) / vValMax,
+ dist);
}
return (outsideMask | fadeStartMask);
}
-#endif /* defined HAVE_VC */
+#endif /* defined HAVE_XSIMD */
private:
qreal m_radius;
@@ -105,35 +117,36 @@ private:
const BaseFade &m_baseFade;
};
-template <class BaseFade>
+template<class BaseFade>
class KisAntialiasingFadeMaker2D
{
public:
KisAntialiasingFadeMaker2D(const BaseFade &baseFade, bool enableAntialiasing)
- : m_xLimit(0),
- m_yLimit(0),
- m_xFadeLimitStart(0),
- m_yFadeLimitStart(0),
- m_xFadeCoeff(0),
- m_yFadeCoeff(0),
- m_enableAntialiasing(enableAntialiasing),
- m_baseFade(baseFade)
+ : m_xLimit(0)
+ , m_yLimit(0)
+ , m_xFadeLimitStart(0)
+ , m_yFadeLimitStart(0)
+ , m_xFadeCoeff(0)
+ , m_yFadeCoeff(0)
+ , m_enableAntialiasing(enableAntialiasing)
+ , m_baseFade(baseFade)
{
}
KisAntialiasingFadeMaker2D(const KisAntialiasingFadeMaker2D &rhs, const BaseFade &baseFade)
- : m_xLimit(rhs.m_xLimit),
- m_yLimit(rhs.m_yLimit),
- m_xFadeLimitStart(rhs.m_xFadeLimitStart),
- m_yFadeLimitStart(rhs.m_yFadeLimitStart),
- m_xFadeCoeff(rhs.m_xFadeCoeff),
- m_yFadeCoeff(rhs.m_yFadeCoeff),
- m_enableAntialiasing(rhs.m_enableAntialiasing),
- m_baseFade(baseFade)
+ : m_xLimit(rhs.m_xLimit)
+ , m_yLimit(rhs.m_yLimit)
+ , m_xFadeLimitStart(rhs.m_xFadeLimitStart)
+ , m_yFadeLimitStart(rhs.m_yFadeLimitStart)
+ , m_xFadeCoeff(rhs.m_xFadeCoeff)
+ , m_yFadeCoeff(rhs.m_yFadeCoeff)
+ , m_enableAntialiasing(rhs.m_enableAntialiasing)
+ , m_baseFade(baseFade)
{
}
- void setLimits(qreal halfWidth, qreal halfHeight) {
+ void setLimits(qreal halfWidth, qreal halfHeight)
+ {
m_xLimit = halfWidth;
m_yLimit = halfHeight;
@@ -144,7 +157,8 @@ public:
m_yFadeCoeff = 1.0 / (m_yLimit - m_yFadeLimitStart);
}
- inline bool needFade(qreal x, qreal y, quint8 *value) {
+ inline bool needFade(qreal x, qreal y, quint8 *value)
+ {
x = qAbs(x);
y = qAbs(y);
@@ -187,57 +201,67 @@ public:
return false;
}
-#if defined HAVE_VC
- Vc::float_m needFade(Vc::float_v &xr, Vc::float_v &yr) const {
+#if defined HAVE_XSIMD
+ template<typename A>
+ xsimd::batch_bool<float, A> needFade(xsimd::batch<float, A> &xr, xsimd::batch<float, A> &yr) const
+ {
+ using float_v = xsimd::batch<float, A>;
+ using float_m = typename float_v::batch_bool_type;
- Vc::float_v vXLimit(m_xLimit);
- Vc::float_v vYLimit(m_yLimit);
+ float_v vXLimit(m_xLimit);
+ float_v vYLimit(m_yLimit);
- Vc::float_m outXMask = Vc::abs(xr) > vXLimit;
- Vc::float_m outYMask = Vc::abs(yr) > vYLimit;
+ float_m outXMask = xsimd::abs(xr) > vXLimit;
+ float_m outYMask = xsimd::abs(yr) > vYLimit;
return (outXMask | outYMask);
}
// Apply fader separately to avoid calculating vValue twice.
- void apply2DFader(Vc::float_v &vValue, Vc::float_m &excludeMask, Vc::float_v &xr, Vc::float_v &yr) const {
- const Vc::float_v vValMax(255.f);
+ template<typename A>
+ void apply2DFader(xsimd::batch<float, A> &vValue, xsimd::batch_bool<float, A> &excludeMask, xsimd::batch<float, A> &xr, xsimd::batch<float, A> &yr) const
+ {
+ using float_v = xsimd::batch<float, A>;
+ using float_m = typename float_v::batch_bool_type;
+
+ const float_v vValMax(255.f);
- if(m_enableAntialiasing){
- Vc::float_v vXFadeLimitStart(m_xFadeLimitStart);
- Vc::float_v vYFadeLimitStart(m_yFadeLimitStart);
- Vc::float_v vXFadeCoeff(m_xFadeCoeff);
- Vc::float_v vYFadeCoeff(m_yFadeCoeff);
+ if (m_enableAntialiasing) {
+ float_v vXFadeLimitStart(m_xFadeLimitStart);
+ float_v vYFadeLimitStart(m_yFadeLimitStart);
+ float_v vXFadeCoeff(m_xFadeCoeff);
+ float_v vYFadeCoeff(m_yFadeCoeff);
- Vc::float_v xra = abs(xr);
- Vc::float_m fadeXStartMask(false);
- Vc::float_m fadeYStartMask(false);
+ float_v xra = xsimd::abs(xr);
+ float_m fadeXStartMask(false);
+ float_m fadeYStartMask(false);
- Vc::float_v fadeValue;
- Vc::SimdArray<quint16,Vc::float_v::size()> vBaseValue(vValue);
+ float_v fadeValue{};
+ float_v vBaseValue = xsimd::truncate_to_type<uint16_t>(vValue);
fadeXStartMask = xra > vXFadeLimitStart;
fadeXStartMask = (fadeXStartMask ^ excludeMask) & fadeXStartMask;
- if (!fadeXStartMask.isFull()) {
+ if (!xsimd::all(fadeXStartMask)) {
fadeValue = vBaseValue + (vValMax - vBaseValue) * (xra - vXFadeLimitStart) * vXFadeCoeff;
- fadeValue(fadeXStartMask & ((yr > vYFadeLimitStart) & (fadeValue < vValMax)) ) =
- fadeValue + (vValMax - fadeValue) * (yr - vYFadeLimitStart) * vYFadeCoeff;
- vValue(fadeXStartMask) = fadeValue;
+ fadeValue = xsimd::select(fadeXStartMask & ((yr > vYFadeLimitStart) & (fadeValue < vValMax)),
+ fadeValue + (vValMax - fadeValue) * (yr - vYFadeLimitStart) * vYFadeCoeff,
+ fadeValue);
+ vValue = xsimd::select(fadeXStartMask, fadeValue, vValue);
}
fadeYStartMask = yr > vYFadeLimitStart;
fadeYStartMask = (fadeYStartMask ^ fadeXStartMask) & fadeYStartMask;
- if (!fadeYStartMask.isFull()) {
+ if (!xsimd::all(fadeYStartMask)) {
fadeValue = vBaseValue + (vValMax - vBaseValue) * (yr - vYFadeLimitStart) * vYFadeCoeff;
- fadeValue(fadeYStartMask & ((xra > vXFadeLimitStart) & (fadeValue < vValMax)) ) =
- fadeValue + (vValMax - fadeValue) * (xra - vXFadeLimitStart) * vXFadeCoeff;
- vValue(fadeYStartMask) = fadeValue;
+ fadeValue = xsimd::select(fadeYStartMask & ((xra > vXFadeLimitStart) & (fadeValue < vValMax)),
+ fadeValue + (vValMax - fadeValue) * (xra - vXFadeLimitStart) * vXFadeCoeff,
+ fadeValue);
+ vValue = xsimd::select(fadeYStartMask, fadeValue, vValue);
}
}
- return;
}
-#endif /* defined HAVE_VC */
+#endif /* defined HAVE_XSIMD */
private:
qreal m_xLimit;
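
Two idioms recur in the hunk above and throughout the port: Vc's mask member
functions become free functions (fadeXStartMask.isFull() becomes
xsimd::all(fadeXStartMask)), and the vector entry points are now templated over
the architecture A instead of binding to Vc's globally fixed float_v, so one
header serves every per-arch translation unit. A minimal sketch of the combined
pattern (fadeAbove is an invented name):

    #include <xsimd_extensions/xsimd.hpp>

    template<typename A>
    xsimd::batch<float, A> fadeAbove(xsimd::batch<float, A> v, float start, float coeff)
    {
        using float_v = xsimd::batch<float, A>;
        using float_m = typename float_v::batch_bool_type;

        const float_m fading = v > float_v(start);
        if (xsimd::all(fading)) { // Vc: fading.isFull()
            return (v - float_v(start)) * float_v(coeff);
        }
        // Vc: v(fading) = (v - start) * coeff;
        return xsimd::select(fading, (v - float_v(start)) * float_v(coeff), v);
    }

The xsimd::set_one(dist, outsideMask) call above is likewise one of this
commit's extension helpers: shorthand for selecting 1.0f into the masked lanes.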
diff --git a/libs/image/kis_base_mask_generator.cpp b/libs/image/kis_base_mask_generator.cpp
index 94a3f755a2..ebad368e5b 100644
--- a/libs/image/kis_base_mask_generator.cpp
+++ b/libs/image/kis_base_mask_generator.cpp
@@ -7,7 +7,7 @@
* SPDX-License-Identifier: GPL-2.0-or-later
*/
-#include <compositeops/KoVcMultiArchBuildSupport.h> //vc.h must come first
+#include <compositeops/KoMultiArchBuildSupport.h>
#include "kis_brush_mask_applicator_factories.h"
#include "kis_mask_generator.h"
diff --git a/libs/image/kis_brush_mask_applicator_factories.cpp b/libs/image/kis_brush_mask_applicator_factories.cpp
index 105d8577da..224b7703fc 100644
--- a/libs/image/kis_brush_mask_applicator_factories.cpp
+++ b/libs/image/kis_brush_mask_applicator_factories.cpp
@@ -14,61 +14,60 @@
#include "kis_curve_rect_mask_generator.h"
#include "kis_rect_mask_generator.h"
-#include "kis_brush_mask_applicator_base.h"
#include "kis_brush_mask_vector_applicator.h"
template<>
template<>
MaskApplicatorFactory<KisMaskGenerator>::ReturnType
-MaskApplicatorFactory<KisMaskGenerator>::create<Vc::CurrentImplementation::current()>(ParamType maskGenerator)
+MaskApplicatorFactory<KisMaskGenerator>::create<xsimd::current_arch>(ParamType maskGenerator)
{
- return new KisBrushMaskScalarApplicator<KisMaskGenerator, Vc::CurrentImplementation::current()>(maskGenerator);
+ return new KisBrushMaskScalarApplicator<KisMaskGenerator,xsimd::current_arch>(maskGenerator);
}
template<>
template<>
MaskApplicatorFactory<KisCircleMaskGenerator>::ReturnType
-MaskApplicatorFactory<KisCircleMaskGenerator>::create<Vc::CurrentImplementation::current()>(ParamType maskGenerator)
+MaskApplicatorFactory<KisCircleMaskGenerator>::create<xsimd::current_arch>(ParamType maskGenerator)
{
- return new KisBrushMaskVectorApplicator<KisCircleMaskGenerator, Vc::CurrentImplementation::current()>(maskGenerator);
+ return new KisBrushMaskVectorApplicator<KisCircleMaskGenerator,xsimd::current_arch>(maskGenerator);
}
template<>
template<>
MaskApplicatorFactory<KisGaussCircleMaskGenerator>::ReturnType
-MaskApplicatorFactory<KisGaussCircleMaskGenerator>::create<Vc::CurrentImplementation::current()>(ParamType maskGenerator)
+MaskApplicatorFactory<KisGaussCircleMaskGenerator>::create<xsimd::current_arch>(ParamType maskGenerator)
{
- return new KisBrushMaskVectorApplicator<KisGaussCircleMaskGenerator, Vc::CurrentImplementation::current()>(maskGenerator);
+ return new KisBrushMaskVectorApplicator<KisGaussCircleMaskGenerator, xsimd::current_arch>(maskGenerator);
}
template<>
template<>
MaskApplicatorFactory<KisCurveCircleMaskGenerator>::ReturnType
-MaskApplicatorFactory<KisCurveCircleMaskGenerator>::create<Vc::CurrentImplementation::current()>(ParamType maskGenerator)
+MaskApplicatorFactory<KisCurveCircleMaskGenerator>::create<xsimd::current_arch>(ParamType maskGenerator)
{
- return new KisBrushMaskVectorApplicator<KisCurveCircleMaskGenerator, Vc::CurrentImplementation::current()>(maskGenerator);
+ return new KisBrushMaskVectorApplicator<KisCurveCircleMaskGenerator,xsimd::current_arch>(maskGenerator);
}
template<>
template<>
MaskApplicatorFactory<KisRectangleMaskGenerator>::ReturnType
-MaskApplicatorFactory<KisRectangleMaskGenerator>::create<Vc::CurrentImplementation::current()>(ParamType maskGenerator)
+MaskApplicatorFactory<KisRectangleMaskGenerator>::create<xsimd::current_arch>(ParamType maskGenerator)
{
- return new KisBrushMaskVectorApplicator<KisRectangleMaskGenerator, Vc::CurrentImplementation::current()>(maskGenerator);
+ return new KisBrushMaskVectorApplicator<KisRectangleMaskGenerator,xsimd::current_arch>(maskGenerator);
}
template<>
template<>
MaskApplicatorFactory<KisGaussRectangleMaskGenerator>::ReturnType
-MaskApplicatorFactory<KisGaussRectangleMaskGenerator>::create<Vc::CurrentImplementation::current()>(ParamType maskGenerator)
+MaskApplicatorFactory<KisGaussRectangleMaskGenerator>::create<xsimd::current_arch>(ParamType maskGenerator)
{
- return new KisBrushMaskVectorApplicator<KisGaussRectangleMaskGenerator, Vc::CurrentImplementation::current()>(maskGenerator);
+ return new KisBrushMaskVectorApplicator<KisGaussRectangleMaskGenerator,xsimd::current_arch>(maskGenerator);
}
template<>
template<>
MaskApplicatorFactory<KisCurveRectangleMaskGenerator>::ReturnType
-MaskApplicatorFactory<KisCurveRectangleMaskGenerator>::create<Vc::CurrentImplementation::current()>(ParamType maskGenerator)
+MaskApplicatorFactory<KisCurveRectangleMaskGenerator>::create<xsimd::current_arch>(ParamType maskGenerator)
{
- return new KisBrushMaskVectorApplicator<KisCurveRectangleMaskGenerator, Vc::CurrentImplementation::current()>(maskGenerator);
+ return new KisBrushMaskVectorApplicator<KisCurveRectangleMaskGenerator,xsimd::current_arch>(maskGenerator);
}
diff --git a/libs/image/kis_brush_mask_applicator_factories.h b/libs/image/kis_brush_mask_applicator_factories.h
index 60592e4516..8705ea9fa3 100644
--- a/libs/image/kis_brush_mask_applicator_factories.h
+++ b/libs/image/kis_brush_mask_applicator_factories.h
@@ -8,16 +8,17 @@
#ifndef __KIS_BRUSH_MASK_APPLICATOR_FACTORIES_H
#define __KIS_BRUSH_MASK_APPLICATOR_FACTORIES_H
-#include <compositeops/KoVcMultiArchBuildSupport.h>
+#include <compositeops/KoMultiArchBuildSupport.h>
class KisBrushMaskApplicatorBase;
template<class MaskGenerator>
-struct MaskApplicatorFactory {
+struct MaskApplicatorFactory
+{
using ParamType = MaskGenerator *;
using ReturnType = KisBrushMaskApplicatorBase *;
- template<Vc::Implementation _impl>
+ template<typename _impl>
static ReturnType create(ParamType maskGenerator);
};
diff --git a/libs/image/kis_brush_mask_applicator_factories_Scalar.cpp b/libs/image/kis_brush_mask_applicator_factories_Scalar.cpp
index 6cf142561f..a7d6816bdf 100644
--- a/libs/image/kis_brush_mask_applicator_factories_Scalar.cpp
+++ b/libs/image/kis_brush_mask_applicator_factories_Scalar.cpp
@@ -19,55 +19,55 @@
template<>
template<>
MaskApplicatorFactory<KisMaskGenerator>::ReturnType
-MaskApplicatorFactory<KisMaskGenerator>::create<Vc::ScalarImpl>(ParamType maskGenerator)
+MaskApplicatorFactory<KisMaskGenerator>::create<xsimd::generic>(ParamType maskGenerator)
{
- return new KisBrushMaskScalarApplicator<KisMaskGenerator, Vc::ScalarImpl>(maskGenerator);
+ return new KisBrushMaskScalarApplicator<KisMaskGenerator, xsimd::generic>(maskGenerator);
}
template<>
template<>
MaskApplicatorFactory<KisCircleMaskGenerator>::ReturnType
-MaskApplicatorFactory<KisCircleMaskGenerator>::create<Vc::ScalarImpl>(ParamType maskGenerator)
+MaskApplicatorFactory<KisCircleMaskGenerator>::create<xsimd::generic>(ParamType maskGenerator)
{
- return new KisBrushMaskScalarApplicator<KisCircleMaskGenerator, Vc::ScalarImpl>(maskGenerator);
+ return new KisBrushMaskScalarApplicator<KisCircleMaskGenerator, xsimd::generic>(maskGenerator);
}
template<>
template<>
MaskApplicatorFactory<KisGaussCircleMaskGenerator>::ReturnType
-MaskApplicatorFactory<KisGaussCircleMaskGenerator>::create<Vc::ScalarImpl>(ParamType maskGenerator)
+MaskApplicatorFactory<KisGaussCircleMaskGenerator>::create<xsimd::generic>(ParamType maskGenerator)
{
- return new KisBrushMaskScalarApplicator<KisGaussCircleMaskGenerator, Vc::ScalarImpl>(maskGenerator);
+ return new KisBrushMaskScalarApplicator<KisGaussCircleMaskGenerator, xsimd::generic>(maskGenerator);
}
template<>
template<>
MaskApplicatorFactory<KisCurveCircleMaskGenerator>::ReturnType
-MaskApplicatorFactory<KisCurveCircleMaskGenerator>::create<Vc::ScalarImpl>(ParamType maskGenerator)
+MaskApplicatorFactory<KisCurveCircleMaskGenerator>::create<xsimd::generic>(ParamType maskGenerator)
{
- return new KisBrushMaskScalarApplicator<KisCurveCircleMaskGenerator, Vc::ScalarImpl>(maskGenerator);
+ return new KisBrushMaskScalarApplicator<KisCurveCircleMaskGenerator, xsimd::generic>(maskGenerator);
}
template<>
template<>
MaskApplicatorFactory<KisRectangleMaskGenerator>::ReturnType
-MaskApplicatorFactory<KisRectangleMaskGenerator>::create<Vc::ScalarImpl>(ParamType maskGenerator)
+MaskApplicatorFactory<KisRectangleMaskGenerator>::create<xsimd::generic>(ParamType maskGenerator)
{
- return new KisBrushMaskScalarApplicator<KisRectangleMaskGenerator, Vc::ScalarImpl>(maskGenerator);
+ return new KisBrushMaskScalarApplicator<KisRectangleMaskGenerator, xsimd::generic>(maskGenerator);
}
template<>
template<>
MaskApplicatorFactory<KisGaussRectangleMaskGenerator>::ReturnType
-MaskApplicatorFactory<KisGaussRectangleMaskGenerator>::create<Vc::ScalarImpl>(ParamType maskGenerator)
+MaskApplicatorFactory<KisGaussRectangleMaskGenerator>::create<xsimd::generic>(ParamType maskGenerator)
{
- return new KisBrushMaskScalarApplicator<KisGaussRectangleMaskGenerator, Vc::ScalarImpl>(maskGenerator);
+ return new KisBrushMaskScalarApplicator<KisGaussRectangleMaskGenerator, xsimd::generic>(maskGenerator);
}
template<>
template<>
MaskApplicatorFactory<KisCurveRectangleMaskGenerator>::ReturnType
-MaskApplicatorFactory<KisCurveRectangleMaskGenerator>::create<Vc::ScalarImpl>(ParamType maskGenerator)
+MaskApplicatorFactory<KisCurveRectangleMaskGenerator>::create<xsimd::generic>(ParamType maskGenerator)
{
- return new KisBrushMaskScalarApplicator<KisCurveRectangleMaskGenerator, Vc::ScalarImpl>(maskGenerator);
+ return new KisBrushMaskScalarApplicator<KisCurveRectangleMaskGenerator, xsimd::generic>(maskGenerator);
}
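Note that xsimd::generic plays the role Vc::ScalarImpl used to: it is an ordinary type, so the scalar specializations above remain selectable through normal template machinery even in builds without xsimd. The no-xsimd stand-in added later in this patch (config/xsimd_config.hpp) amounts to:

    // Scalar fallback tag when xsimd is unavailable; xsimd-enabled builds
    // get the full xsimd::generic architecture from <xsimd/xsimd.hpp>.
    namespace xsimd
    {
    class generic
    {
    };
    } // namespace xsimd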
diff --git a/libs/image/kis_brush_mask_processor_factories.cpp b/libs/image/kis_brush_mask_processor_factories.cpp
index 3989df79ce..48efd53501 100644
--- a/libs/image/kis_brush_mask_processor_factories.cpp
+++ b/libs/image/kis_brush_mask_processor_factories.cpp
@@ -5,9 +5,9 @@
* SPDX-License-Identifier: GPL-2.0-or-later
*/
-#include <compositeops/KoVcMultiArchBuildSupport.h>
+#include "xsimd_extensions/xsimd.hpp"
-#if defined HAVE_VC
+#if defined HAVE_XSIMD
#include "kis_circle_mask_generator.h"
#include "kis_circle_mask_generator_p.h"
@@ -32,14 +32,17 @@
template<>
template<>
-void FastRowProcessor<KisCircleMaskGenerator>::process<Vc::CurrentImplementation::current()>(float *buffer,
- int width,
- float y,
- float cosa,
- float sina,
- float centerX,
- float centerY)
+void FastRowProcessor<KisCircleMaskGenerator>::process<xsimd::current_arch>(float *buffer,
+ int width,
+ float y,
+ float cosa,
+ float sina,
+ float centerX,
+ float centerY)
{
+ using float_v = xsimd::batch<float, xsimd::current_arch>;
+ using float_m = typename float_v::batch_bool_type;
+
const bool useSmoothing = d->copyOfAntialiasEdges;
float y_ = y - centerY;
@@ -48,150 +51,156 @@ void FastRowProcessor<KisCircleMaskGenerator>::process<Vc::CurrentImplementation
float *bufferPointer = buffer;
- Vc::float_v currentIndices = Vc::float_v::IndexesFromZero();
+ float_v currentIndices = xsimd::detail::make_sequence_as_batch<float_v>();
- Vc::float_v increment((float)Vc::float_v::size());
- Vc::float_v vCenterX(centerX);
+ float_v increment((float)float_v::size);
+ float_v vCenterX(centerX);
- Vc::float_v vCosa(cosa);
- Vc::float_v vSina(sina);
- Vc::float_v vCosaY_(cosay_);
- Vc::float_v vSinaY_(sinay_);
+ float_v vCosa(cosa);
+ float_v vSina(sina);
+ float_v vCosaY_(cosay_);
+ float_v vSinaY_(sinay_);
- Vc::float_v vXCoeff(static_cast<float>(d->xcoef));
- Vc::float_v vYCoeff(static_cast<float>(d->ycoef));
+ float_v vXCoeff(static_cast<float>(d->xcoef));
+ float_v vYCoeff(static_cast<float>(d->ycoef));
- Vc::float_v vTransformedFadeX(static_cast<float>(d->transformedFadeX));
- Vc::float_v vTransformedFadeY(static_cast<float>(d->transformedFadeY));
+ float_v vTransformedFadeX(static_cast<float>(d->transformedFadeX));
+ float_v vTransformedFadeY(static_cast<float>(d->transformedFadeY));
- Vc::float_v vOne(Vc::One);
+ float_v vOne(1);
- for (size_t i = 0; i < static_cast<size_t>(width); i += Vc::float_v::size()) {
- Vc::float_v x_ = currentIndices - vCenterX;
+ for (size_t i = 0; i < static_cast<size_t>(width); i += float_v::size) {
+ float_v x_ = currentIndices - vCenterX;
- Vc::float_v xr = x_ * vCosa - vSinaY_;
- Vc::float_v yr = x_ * vSina + vCosaY_;
+ float_v xr = x_ * vCosa - vSinaY_;
+ float_v yr = x_ * vSina + vCosaY_;
- Vc::float_v n = pow2(xr * vXCoeff) + pow2(yr * vYCoeff);
- Vc::float_m outsideMask = n > vOne;
+ float_v n = xsimd::pow2(xr * vXCoeff) + xsimd::pow2(yr * vYCoeff);
+ float_m outsideMask = n > vOne;
- if (!outsideMask.isFull()) {
+ if (!xsimd::all(outsideMask)) {
if (useSmoothing) {
- xr = Vc::abs(xr) + vOne;
- yr = Vc::abs(yr) + vOne;
+ xr = xsimd::abs(xr) + vOne;
+ yr = xsimd::abs(yr) + vOne;
}
- Vc::float_v vNormFade = pow2(xr * vTransformedFadeX) + pow2(yr * vTransformedFadeY);
- Vc::float_m vNormLowMask = vNormFade < vOne;
- vNormFade.setZero(vNormLowMask);
+ float_v vNormFade = xsimd::pow2(xr * vTransformedFadeX) + xsimd::pow2(yr * vTransformedFadeY);
+ float_m vNormLowMask = vNormFade < vOne;
+ vNormFade = xsimd::set_zero(vNormFade, vNormLowMask);
// 255 * n * (normFade - 1) / (normFade - n)
- Vc::float_v vFade = n * (vNormFade - vOne) / (vNormFade - n);
+ float_v vFade = n * (vNormFade - vOne) / (vNormFade - n);
// Mask in the inner circle of the mask
- Vc::float_m mask = vNormFade < vOne;
- vFade.setZero(mask);
+ float_m mask = vNormFade < vOne;
+ vFade = xsimd::set_zero(vFade, mask);
// Mask out the outer circle of the mask
- vFade(outsideMask) = vOne;
+ vFade = xsimd::set_one(vFade, outsideMask);
- vFade.store(bufferPointer, Vc::Aligned);
+ vFade.store_aligned(bufferPointer);
} else {
// Mask out everything outside the circle
- vOne.store(bufferPointer, Vc::Aligned);
+ vOne.store_aligned(bufferPointer);
}
currentIndices = currentIndices + increment;
- bufferPointer += Vc::float_v::size();
+ bufferPointer += float_v::size;
}
}
template<>
template<>
-void FastRowProcessor<KisGaussCircleMaskGenerator>::process<Vc::CurrentImplementation::current()>(float *buffer,
- int width,
- float y,
- float cosa,
- float sina,
- float centerX,
- float centerY)
+void FastRowProcessor<KisGaussCircleMaskGenerator>::process<xsimd::current_arch>(float *buffer,
+ int width,
+ float y,
+ float cosa,
+ float sina,
+ float centerX,
+ float centerY)
{
+ using float_v = xsimd::batch<float, xsimd::current_arch>;
+ using float_m = float_v::batch_bool_type;
+
float y_ = y - centerY;
float sinay_ = sina * y_;
float cosay_ = cosa * y_;
float *bufferPointer = buffer;
- Vc::float_v currentIndices = Vc::float_v::IndexesFromZero();
+ float_v currentIndices = xsimd::detail::make_sequence_as_batch<float_v>();
- Vc::float_v increment(static_cast<float>(Vc::float_v::size()));
- Vc::float_v vCenterX(centerX);
- Vc::float_v vCenter(static_cast<float>(d->center));
+ float_v increment(static_cast<float>(float_v::size));
+ float_v vCenterX(centerX);
+ float_v vCenter(static_cast<float>(d->center));
- Vc::float_v vCosa(cosa);
- Vc::float_v vSina(sina);
- Vc::float_v vCosaY_(cosay_);
- Vc::float_v vSinaY_(sinay_);
+ float_v vCosa(cosa);
+ float_v vSina(sina);
+ float_v vCosaY_(cosay_);
+ float_v vSinaY_(sinay_);
- Vc::float_v vYCoeff(static_cast<float>(d->ycoef));
- Vc::float_v vDistfactor(static_cast<float>(d->distfactor));
- Vc::float_v vAlphafactor(static_cast<float>(d->alphafactor));
+ float_v vYCoeff(static_cast<float>(d->ycoef));
+ float_v vDistfactor(static_cast<float>(d->distfactor));
+ float_v vAlphafactor(static_cast<float>(d->alphafactor));
- Vc::float_v vZero(Vc::Zero);
- Vc::float_v vValMax(255.f);
+ float_v vZero(0);
+ float_v vValMax(255.f);
- for (size_t i = 0; i < static_cast<size_t>(width); i += Vc::float_v::size()) {
- Vc::float_v x_ = currentIndices - vCenterX;
+ for (size_t i = 0; i < static_cast<size_t>(width); i += float_v::size) {
+ float_v x_ = currentIndices - vCenterX;
- Vc::float_v xr = x_ * vCosa - vSinaY_;
- Vc::float_v yr = x_ * vSina + vCosaY_;
+ float_v xr = x_ * vCosa - vSinaY_;
+ float_v yr = x_ * vSina + vCosaY_;
- Vc::float_v dist = sqrt(pow2(xr) + pow2(yr * vYCoeff));
+ float_v dist = xsimd::sqrt(xsimd::pow2(xr) + xsimd::pow2(yr * vYCoeff));
// Apply FadeMaker mask and operations
- Vc::float_m excludeMask = d->fadeMaker.needFade(dist);
+ float_m excludeMask = d->fadeMaker.needFade(dist);
- if (!excludeMask.isFull()) {
- Vc::float_v valDist = dist * vDistfactor;
- Vc::float_v fullFade =
- vAlphafactor * (VcExtraMath::erf(valDist + vCenter) - VcExtraMath::erf(valDist - vCenter));
+ if (!xsimd::all(excludeMask)) {
+ float_v valDist = dist * vDistfactor;
+ float_v fullFade = vAlphafactor * (VcExtraMath::erf(valDist + vCenter) - VcExtraMath::erf(valDist - vCenter));
- Vc::float_m mask;
+ float_m mask;
// Mask in the inner circle of the mask
mask = fullFade < vZero;
- fullFade.setZero(mask);
+ fullFade = xsimd::set_zero(fullFade, mask);
// Mask the outer circle
mask = fullFade > 254.974f;
- fullFade(mask) = vValMax;
+ fullFade = xsimd::select(mask, vValMax, fullFade);
// Mask (value - value), precision errors.
- Vc::float_v vFade = (vValMax - fullFade) / vValMax;
+ float_v vFade = (vValMax - fullFade) / vValMax;
// return original dist values before vFade transform
- vFade(excludeMask) = dist;
- vFade.store(bufferPointer, Vc::Aligned);
+ vFade = xsimd::select(excludeMask, dist, vFade);
+ vFade.store_aligned(bufferPointer);
} else {
- dist.store(bufferPointer, Vc::Aligned);
+ dist.store_aligned(bufferPointer);
}
currentIndices = currentIndices + increment;
- bufferPointer += Vc::float_v::size();
+ bufferPointer += float_v::size;
}
}
template<>
template<>
-void FastRowProcessor<KisCurveCircleMaskGenerator>::process<Vc::CurrentImplementation::current()>(float *buffer,
- int width,
- float y,
- float cosa,
- float sina,
- float centerX,
- float centerY)
+void FastRowProcessor<KisCurveCircleMaskGenerator>::process<xsimd::current_arch>(float *buffer,
+ int width,
+ float y,
+ float cosa,
+ float sina,
+ float centerX,
+ float centerY)
{
+ using int_v = xsimd::batch<int, xsimd::current_arch>;
+ using float_v = xsimd::batch<float, xsimd::current_arch>;
+ using float_m = float_v::batch_bool_type;
+
float y_ = y - centerY;
float sinay_ = sina * y_;
float cosay_ = cosa * y_;
@@ -200,88 +209,89 @@ void FastRowProcessor<KisCurveCircleMaskGenerator>::process<Vc::CurrentImplement
qreal *curveDataPointer = d->curveData.data();
- Vc::float_v currentIndices = Vc::float_v::IndexesFromZero();
+ float_v currentIndices = xsimd::detail::make_sequence_as_batch<float_v>();
- Vc::float_v increment((float)Vc::float_v::size());
- Vc::float_v vCenterX(centerX);
+ float_v increment((float)float_v::size);
+ float_v vCenterX(centerX);
- Vc::float_v vCosa(cosa);
- Vc::float_v vSina(sina);
- Vc::float_v vCosaY_(cosay_);
- Vc::float_v vSinaY_(sinay_);
+ float_v vCosa(cosa);
+ float_v vSina(sina);
+ float_v vCosaY_(cosay_);
+ float_v vSinaY_(sinay_);
- Vc::float_v vYCoeff(static_cast<float>(d->ycoef));
- Vc::float_v vXCoeff(static_cast<float>(d->xcoef));
- Vc::float_v vCurveResolution(static_cast<float>(d->curveResolution));
+ float_v vYCoeff(static_cast<float>(d->ycoef));
+ float_v vXCoeff(static_cast<float>(d->xcoef));
+ float_v vCurveResolution(static_cast<float>(d->curveResolution));
- Vc::float_v vCurvedData(Vc::Zero);
- Vc::float_v vCurvedData1(Vc::Zero);
+ float_v vCurvedData(0);
+ float_v vCurvedData1(0);
- Vc::float_v vOne(Vc::One);
- Vc::float_v vZero(Vc::Zero);
+ float_v vOne(1);
+ float_v vZero(0);
- for (size_t i = 0; i < static_cast<size_t>(width); i += Vc::float_v::size()) {
- Vc::float_v x_ = currentIndices - vCenterX;
+ for (size_t i = 0; i < static_cast<size_t>(width); i += float_v::size) {
+ float_v x_ = currentIndices - vCenterX;
- Vc::float_v xr = x_ * vCosa - vSinaY_;
- Vc::float_v yr = x_ * vSina + vCosaY_;
+ float_v xr = x_ * vCosa - vSinaY_;
+ float_v yr = x_ * vSina + vCosaY_;
- Vc::float_v dist = pow2(xr * vXCoeff) + pow2(yr * vYCoeff);
+ float_v dist = xsimd::pow2(xr * vXCoeff) + xsimd::pow2(yr * vYCoeff);
// Apply FadeMaker mask and operations
- Vc::float_m excludeMask = d->fadeMaker.needFade(dist);
+ float_m excludeMask = d->fadeMaker.needFade(dist);
- if (!excludeMask.isFull()) {
- Vc::float_v valDist = dist * vCurveResolution;
+ if (!xsimd::all(excludeMask)) {
+ float_v valDist = dist * vCurveResolution;
// truncate
- Vc::float_v::IndexType vAlphaValue(valDist);
- Vc::float_v vFloatAlphaValue = vAlphaValue;
+ int_v vAlphaValue = xsimd::to_int(valDist);
+ float_v vFloatAlphaValue = xsimd::to_float(vAlphaValue);
- Vc::float_v alphaValueF = valDist - vFloatAlphaValue;
+ float_v alphaValueF = valDist - vFloatAlphaValue;
- Vc::float_m alphaMask = vAlphaValue < vZero;
- vAlphaValue.setZero(alphaMask);
+ auto alphaMask = vAlphaValue < int_v(0);
+ vAlphaValue = xsimd::set_zero(vAlphaValue, alphaMask);
- vCurvedData.gather(curveDataPointer, vAlphaValue);
- vCurvedData1.gather(curveDataPointer, vAlphaValue + 1);
- // Vc::float_v vCurvedData1(curveDataPointer,vAlphaValue + 1);
+ vCurvedData = float_v::gather(curveDataPointer, vAlphaValue);
+ vCurvedData1 = float_v::gather(curveDataPointer, vAlphaValue + 1);
// vAlpha
- Vc::float_v fullFade = ((vOne - alphaValueF) * vCurvedData + alphaValueF * vCurvedData1);
+ float_v fullFade = ((vOne - alphaValueF) * vCurvedData + alphaValueF * vCurvedData1);
- Vc::float_m mask;
// Mask in the inner circle of the mask
- mask = fullFade < vZero;
- fullFade.setZero(mask);
+ float_m mask = fullFade < vZero;
+ fullFade = xsimd::set_zero(fullFade, mask);
// Mask outer circle of mask
mask = fullFade >= vOne;
- Vc::float_v vFade = (vOne - fullFade);
- vFade.setZero(mask);
+ float_v vFade = (vOne - fullFade);
+ vFade = xsimd::set_zero(vFade, mask);
// return original dist values before vFade transform
- vFade(excludeMask) = dist;
- vFade.store(bufferPointer, Vc::Aligned);
+ vFade = xsimd::select(excludeMask, dist, vFade);
+ vFade.store_aligned(bufferPointer);
} else {
- dist.store(bufferPointer, Vc::Aligned);
+ dist.store_aligned(bufferPointer);
}
currentIndices = currentIndices + increment;
- bufferPointer += Vc::float_v::size();
+ bufferPointer += float_v::size;
}
}
template<>
template<>
-void FastRowProcessor<KisRectangleMaskGenerator>::process<Vc::CurrentImplementation::current()>(float *buffer,
- int width,
- float y,
- float cosa,
- float sina,
- float centerX,
- float centerY)
+void FastRowProcessor<KisRectangleMaskGenerator>::process<xsimd::current_arch>(float *buffer,
+ int width,
+ float y,
+ float cosa,
+ float sina,
+ float centerX,
+ float centerY)
{
+ using float_v = xsimd::batch<float, xsimd::current_arch>;
+ using float_m = float_v::batch_bool_type;
+
const bool useSmoothing = d->copyOfAntialiasEdges;
float y_ = y - centerY;
@@ -290,165 +300,168 @@ void FastRowProcessor<KisRectangleMaskGenerator>::process<Vc::CurrentImplementat
float *bufferPointer = buffer;
- Vc::float_v currentIndices = Vc::float_v::IndexesFromZero();
+ float_v currentIndices = xsimd::detail::make_sequence_as_batch<float_v>();
- Vc::float_v increment((float)Vc::float_v::size());
- Vc::float_v vCenterX(centerX);
+ float_v increment((float)float_v::size);
+ float_v vCenterX(centerX);
- Vc::float_v vCosa(cosa);
- Vc::float_v vSina(sina);
- Vc::float_v vCosaY_(cosay_);
- Vc::float_v vSinaY_(sinay_);
+ float_v vCosa(cosa);
+ float_v vSina(sina);
+ float_v vCosaY_(cosay_);
+ float_v vSinaY_(sinay_);
- Vc::float_v vXCoeff(static_cast<float>(d->xcoeff));
- Vc::float_v vYCoeff(static_cast<float>(d->ycoeff));
+ float_v vXCoeff(static_cast<float>(d->xcoeff));
+ float_v vYCoeff(static_cast<float>(d->ycoeff));
- Vc::float_v vTransformedFadeX(static_cast<float>(d->transformedFadeX));
- Vc::float_v vTransformedFadeY(static_cast<float>(d->transformedFadeY));
+ float_v vTransformedFadeX(static_cast<float>(d->transformedFadeX));
+ float_v vTransformedFadeY(static_cast<float>(d->transformedFadeY));
- Vc::float_v vOne(Vc::One);
- Vc::float_v vZero(Vc::Zero);
- Vc::float_v vTolerance(10000.f);
+ float_v vOne(1);
+ float_v vZero(0);
+ float_v vTolerance(10000.f);
- for (size_t i = 0; i < static_cast<size_t>(width); i += Vc::float_v::size()) {
- Vc::float_v x_ = currentIndices - vCenterX;
+ for (size_t i = 0; i < static_cast<size_t>(width); i += float_v::size) {
+ float_v x_ = currentIndices - vCenterX;
- Vc::float_v xr = Vc::abs(x_ * vCosa - vSinaY_);
- Vc::float_v yr = Vc::abs(x_ * vSina + vCosaY_);
+ float_v xr = xsimd::abs(x_ * vCosa - vSinaY_);
+ float_v yr = xsimd::abs(x_ * vSina + vCosaY_);
- Vc::float_v nxr = xr * vXCoeff;
- Vc::float_v nyr = yr * vYCoeff;
+ float_v nxr = xr * vXCoeff;
+ float_v nyr = yr * vYCoeff;
- Vc::float_m outsideMask = (nxr > vOne) || (nyr > vOne);
+ float_m outsideMask = (nxr > vOne) || (nyr > vOne);
- if (!outsideMask.isFull()) {
+ if (!xsimd::all(outsideMask)) {
if (useSmoothing) {
- xr = Vc::abs(xr) + vOne;
- yr = Vc::abs(yr) + vOne;
+ xr = xsimd::abs(xr) + vOne;
+ yr = xsimd::abs(yr) + vOne;
}
- Vc::float_v fxr = xr * vTransformedFadeX;
- Vc::float_v fyr = yr * vTransformedFadeY;
+ float_v fxr = xr * vTransformedFadeX;
+ float_v fyr = yr * vTransformedFadeY;
- Vc::float_v fxrNorm = nxr * (fxr - vOne) / (fxr - nxr);
- Vc::float_v fyrNorm = nyr * (fyr - vOne) / (fyr - nyr);
+ float_v fxrNorm = nxr * (fxr - vOne) / (fxr - nxr);
+ float_v fyrNorm = nyr * (fyr - vOne) / (fyr - nyr);
- Vc::float_v vFade(vZero);
+ float_v vFade(vZero);
- Vc::float_m vFadeMask = fxrNorm < fyrNorm;
- Vc::float_v vMaxVal = vFade;
- vMaxVal(fxr > vOne) = fxrNorm;
- vMaxVal(vFadeMask && fyr > vOne) = fyrNorm;
+ float_m vFadeMask = fxrNorm < fyrNorm;
+ float_v vMaxVal = vFade;
+ vMaxVal = xsimd::select(fxr > vOne, fxrNorm, vMaxVal);
+ vMaxVal = xsimd::select(vFadeMask && fyr > vOne, fyrNorm, vMaxVal);
vFade = vMaxVal;
// Mask out the outer circle of the mask
- vFade(outsideMask) = vOne;
- vFade.store(bufferPointer, Vc::Aligned);
+ vFade = xsimd::select(outsideMask, vOne, vFade);
+ vFade.store_aligned(bufferPointer);
} else {
// Mask out everything outside the circle
- vOne.store(bufferPointer, Vc::Aligned);
+ vOne.store_aligned(bufferPointer);
}
currentIndices = currentIndices + increment;
- bufferPointer += Vc::float_v::size();
+ bufferPointer += float_v::size;
}
}
template<>
template<>
-void FastRowProcessor<KisGaussRectangleMaskGenerator>::process<Vc::CurrentImplementation::current()>(float *buffer,
- int width,
- float y,
- float cosa,
- float sina,
- float centerX,
- float centerY)
+void FastRowProcessor<KisGaussRectangleMaskGenerator>::process<xsimd::current_arch>(float *buffer,
+ int width,
+ float y,
+ float cosa,
+ float sina,
+ float centerX,
+ float centerY)
{
+ using float_v = xsimd::batch<float, xsimd::current_arch>;
+ using float_m = float_v::batch_bool_type;
+
float y_ = y - centerY;
float sinay_ = sina * y_;
float cosay_ = cosa * y_;
float *bufferPointer = buffer;
- Vc::float_v currentIndices = Vc::float_v::IndexesFromZero();
+ float_v currentIndices = xsimd::detail::make_sequence_as_batch<float_v>();
- Vc::float_v increment((float)Vc::float_v::size());
- Vc::float_v vCenterX(centerX);
+ float_v increment((float)float_v::size);
+ float_v vCenterX(centerX);
- Vc::float_v vCosa(cosa);
- Vc::float_v vSina(sina);
- Vc::float_v vCosaY_(cosay_);
- Vc::float_v vSinaY_(sinay_);
+ float_v vCosa(cosa);
+ float_v vSina(sina);
+ float_v vCosaY_(cosay_);
+ float_v vSinaY_(sinay_);
- Vc::float_v vhalfWidth(static_cast<float>(d->halfWidth));
- Vc::float_v vhalfHeight(static_cast<float>(d->halfHeight));
- Vc::float_v vXFade(static_cast<float>(d->xfade));
- Vc::float_v vYFade(static_cast<float>(d->yfade));
+ float_v vhalfWidth(static_cast<float>(d->halfWidth));
+ float_v vhalfHeight(static_cast<float>(d->halfHeight));
+ float_v vXFade(static_cast<float>(d->xfade));
+ float_v vYFade(static_cast<float>(d->yfade));
- Vc::float_v vAlphafactor(static_cast<float>(d->alphafactor));
+ float_v vAlphafactor(static_cast<float>(d->alphafactor));
- Vc::float_v vOne(Vc::One);
- Vc::float_v vZero(Vc::Zero);
- Vc::float_v vValMax(255.f);
+ float_v vOne(1);
+ float_v vZero(0);
+ float_v vValMax(255.f);
- for (size_t i = 0; i < static_cast<size_t>(width); i += Vc::float_v::size()) {
- Vc::float_v x_ = currentIndices - vCenterX;
+ for (size_t i = 0; i < static_cast<size_t>(width); i += float_v::size) {
+ float_v x_ = currentIndices - vCenterX;
- Vc::float_v xr = x_ * vCosa - vSinaY_;
- Vc::float_v yr = Vc::abs(x_ * vSina + vCosaY_);
+ float_v xr = x_ * vCosa - vSinaY_;
+ float_v yr = xsimd::abs(x_ * vSina + vCosaY_);
- Vc::float_v vValue;
+ float_v vValue;
// check if we need to apply fader on values
- Vc::float_m excludeMask = d->fadeMaker.needFade(xr, yr);
- vValue(excludeMask) = vOne;
+ float_m excludeMask = d->fadeMaker.needFade(xr, yr);
+ vValue = xsimd::select(excludeMask, vOne, vValue);
- if (!excludeMask.isFull()) {
- Vc::float_v fullFade = vValMax
- - (vAlphafactor
- * (VcExtraMath::erf((vhalfWidth + xr) * vXFade) + VcExtraMath::erf((vhalfWidth - xr) * vXFade))
+ if (!xsimd::all(excludeMask)) {
+ float_v fullFade = vValMax
+ - (vAlphafactor * (VcExtraMath::erf((vhalfWidth + xr) * vXFade) + VcExtraMath::erf((vhalfWidth - xr) * vXFade))
* (VcExtraMath::erf((vhalfHeight + yr) * vYFade) + VcExtraMath::erf((vhalfHeight - yr) * vYFade)));
// apply antialias fader
d->fadeMaker.apply2DFader(fullFade, excludeMask, xr, yr);
- Vc::float_m mask;
-
// Mask in the inner circle of the mask
- mask = fullFade < vZero;
- fullFade.setZero(mask);
+ float_m mask = fullFade < vZero;
+ fullFade = xsimd::set_zero(fullFade, mask);
// Mask the outer circle
mask = fullFade > 254.974f;
- fullFade(mask) = vValMax;
+ fullFade = xsimd::select(mask, vValMax, fullFade);
// Mask (value - value), precision errors.
- Vc::float_v vFade = fullFade / vValMax;
+ float_v vFade = fullFade / vValMax;
// return original vValue values before vFade transform
- vFade(excludeMask) = vValue;
- vFade.store(bufferPointer, Vc::Aligned);
+ vFade = xsimd::select(excludeMask, vValue, vFade);
+ vFade.store_aligned(bufferPointer);
} else {
- vValue.store(bufferPointer, Vc::Aligned);
+ vValue.store_aligned(bufferPointer);
}
currentIndices = currentIndices + increment;
- bufferPointer += Vc::float_v::size();
+ bufferPointer += float_v::size;
}
}
template<>
template<>
-void FastRowProcessor<KisCurveRectangleMaskGenerator>::process<Vc::CurrentImplementation::current()>(float *buffer,
- int width,
- float y,
- float cosa,
- float sina,
- float centerX,
- float centerY)
+void FastRowProcessor<KisCurveRectangleMaskGenerator>::process<xsimd::current_arch>(float *buffer,
+ int width,
+ float y,
+ float cosa,
+ float sina,
+ float centerX,
+ float centerY)
{
+ using float_v = xsimd::batch<float, xsimd::current_arch>;
+ using float_m = float_v::batch_bool_type;
+
float y_ = y - centerY;
float sinay_ = sina * y_;
float cosay_ = cosa * y_;
@@ -457,57 +470,57 @@ void FastRowProcessor<KisCurveRectangleMaskGenerator>::process<Vc::CurrentImplem
qreal *curveDataPointer = d->curveData.data();
- Vc::float_v currentIndices = Vc::float_v::IndexesFromZero();
+ float_v currentIndices = xsimd::detail::make_sequence_as_batch<float_v>();
- Vc::float_v increment((float)Vc::float_v::size());
- Vc::float_v vCenterX(centerX);
+ float_v increment((float)float_v::size);
+ float_v vCenterX(centerX);
- Vc::float_v vCosa(cosa);
- Vc::float_v vSina(sina);
- Vc::float_v vCosaY_(cosay_);
- Vc::float_v vSinaY_(sinay_);
+ float_v vCosa(cosa);
+ float_v vSina(sina);
+ float_v vCosaY_(cosay_);
+ float_v vSinaY_(sinay_);
- Vc::float_v vYCoeff(static_cast<float>(d->ycoeff));
- Vc::float_v vXCoeff(static_cast<float>(d->xcoeff));
- Vc::float_v vCurveResolution(static_cast<float>(d->curveResolution));
+ float_v vYCoeff(static_cast<float>(d->ycoeff));
+ float_v vXCoeff(static_cast<float>(d->xcoeff));
+ float_v vCurveResolution(static_cast<float>(d->curveResolution));
- Vc::float_v vOne(Vc::One);
- Vc::float_v vZero(Vc::Zero);
- Vc::float_v vValMax(255.f);
+ float_v vOne(1);
+ float_v vZero(0);
+ float_v vValMax(255.f);
- for (size_t i = 0; i < static_cast<size_t>(width); i += Vc::float_v::size()) {
- Vc::float_v x_ = currentIndices - vCenterX;
+ for (size_t i = 0; i < static_cast<size_t>(width); i += float_v::size) {
+ float_v x_ = currentIndices - vCenterX;
- Vc::float_v xr = x_ * vCosa - vSinaY_;
- Vc::float_v yr = Vc::abs(x_ * vSina + vCosaY_);
+ float_v xr = x_ * vCosa - vSinaY_;
+ float_v yr = xsimd::abs(x_ * vSina + vCosaY_);
- Vc::float_v vValue;
+ float_v vValue;
// check if we need to apply fader on values
- Vc::float_m excludeMask = d->fadeMaker.needFade(xr, yr);
- vValue(excludeMask) = vOne;
+ float_m excludeMask = d->fadeMaker.needFade(xr, yr);
+ vValue = xsimd::select(excludeMask, vOne, vValue);
- if (!excludeMask.isFull()) {
+ if (!xsimd::all(excludeMask)) {
// We need to mask the extra area given for alignment
// the next operation should never give values above 1
- Vc::float_v preSIndex = Vc::abs(xr) * vXCoeff;
- Vc::float_v preTIndex = Vc::abs(yr) * vYCoeff;
+ float_v preSIndex = xsimd::abs(xr) * vXCoeff;
+ float_v preTIndex = xsimd::abs(yr) * vYCoeff;
- preSIndex(preSIndex > vOne) = vOne;
- preTIndex(preTIndex > vOne) = vOne;
+ preSIndex = xsimd::select(preSIndex > vOne, vOne, preSIndex);
+ preTIndex = xsimd::select(preTIndex > vOne, vOne, preTIndex);
- Vc::float_v::IndexType sIndex(round(preSIndex * vCurveResolution));
- Vc::float_v::IndexType tIndex(round(preTIndex * vCurveResolution));
+ const auto sIndex = xsimd::nearbyint_as_int(preSIndex * vCurveResolution);
+ const auto tIndex = xsimd::nearbyint_as_int(preTIndex * vCurveResolution);
- Vc::float_v::IndexType sIndexInverted = vCurveResolution - sIndex;
- Vc::float_v::IndexType tIndexInverted = vCurveResolution - tIndex;
+ auto sIndexInverted = xsimd::to_int(vCurveResolution - xsimd::to_float(sIndex));
+ auto tIndexInverted = xsimd::to_int(vCurveResolution - xsimd::to_float(tIndex));
- Vc::float_v vCurvedDataSIndex(curveDataPointer, sIndex);
- Vc::float_v vCurvedDataTIndex(curveDataPointer, tIndex);
- Vc::float_v vCurvedDataSIndexInv(curveDataPointer, sIndexInverted);
- Vc::float_v vCurvedDataTIndexInv(curveDataPointer, tIndexInverted);
+ const auto vCurvedDataSIndex = float_v::gather(curveDataPointer, sIndex);
+ const auto vCurvedDataTIndex = float_v::gather(curveDataPointer, tIndex);
+ const auto vCurvedDataSIndexInv = float_v::gather(curveDataPointer, sIndexInverted);
+ const auto vCurvedDataTIndexInv = float_v::gather(curveDataPointer, tIndexInverted);
- Vc::float_v fullFade = vValMax
+ float_v fullFade = vValMax
* (vOne
- (vCurvedDataSIndex * (vOne - vCurvedDataSIndexInv) * vCurvedDataTIndex
* (vOne - vCurvedDataTIndexInv)));
@@ -515,30 +528,28 @@ void FastRowProcessor<KisCurveRectangleMaskGenerator>::process<Vc::CurrentImplem
// apply antialias fader
d->fadeMaker.apply2DFader(fullFade, excludeMask, xr, yr);
- Vc::float_m mask;
-
// Mask in the inner circle of the mask
- mask = fullFade < vZero;
- fullFade.setZero(mask);
+ float_m mask = fullFade < vZero;
+ fullFade = xsimd::set_zero(fullFade, mask);
// Mask the outer circle
mask = fullFade > 254.974f;
- fullFade(mask) = vValMax;
+ fullFade = xsimd::select(mask, vValMax, fullFade);
// Mask (value - value), precision errors.
- Vc::float_v vFade = fullFade / vValMax;
+ float_v vFade = fullFade / vValMax;
// return original vValue values before vFade transform
- vFade(excludeMask) = vValue;
- vFade.store(bufferPointer, Vc::Aligned);
+ vFade = xsimd::select(excludeMask, vValue, vFade);
+ vFade.store_aligned(bufferPointer);
} else {
- vValue.store(bufferPointer, Vc::Aligned);
+ vValue.store_aligned(bufferPointer);
}
currentIndices = currentIndices + increment;
- bufferPointer += Vc::float_v::size();
+ bufferPointer += float_v::size;
}
}
-#endif /* defined HAVE_VC */
+#endif /* defined HAVE_XSIMD */
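The mechanical change repeated throughout this file is the move from Vc's in-place write-masking to xsimd's functional style: `v(mask) = x` becomes `v = xsimd::select(mask, x, v)`, and `mask.isFull()` becomes `xsimd::all(mask)`. A self-contained sketch of the idiom, assuming an xsimd-enabled build:

    #include <xsimd/xsimd.hpp>

    using float_v = xsimd::batch<float, xsimd::default_arch>;
    using float_m = float_v::batch_bool_type;

    // Clamp lanes above 1.0f; Vc would write v(mask) = vOne in place,
    // xsimd builds and returns a new batch instead.
    float_v clamp_above_one(const float_v &v)
    {
        const float_v vOne(1.0f);
        const float_m mask = v > vOne;
        return xsimd::select(mask, vOne, v);
    }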
diff --git a/libs/image/kis_brush_mask_scalar_applicator.h b/libs/image/kis_brush_mask_scalar_applicator.h
index 51ca1a0656..8f7ff59362 100644
--- a/libs/image/kis_brush_mask_scalar_applicator.h
+++ b/libs/image/kis_brush_mask_scalar_applicator.h
@@ -16,7 +16,7 @@
// 3x3 supersampling
#define SUPERSAMPLING 3
-template<class MaskGenerator, Vc::Implementation impl>
+template<class MaskGenerator, typename impl>
struct KisBrushMaskScalarApplicator : public KisBrushMaskApplicatorBase {
KisBrushMaskScalarApplicator(MaskGenerator *maskGenerator)
: m_maskGenerator(maskGenerator)
diff --git a/libs/image/kis_brush_mask_vector_applicator.h b/libs/image/kis_brush_mask_vector_applicator.h
index dc53a369a1..8e9ed721b4 100644
--- a/libs/image/kis_brush_mask_vector_applicator.h
+++ b/libs/image/kis_brush_mask_vector_applicator.h
@@ -9,11 +9,9 @@
#ifndef KIS_BRUSH_VECTOR_APPLICATOR_H
#define KIS_BRUSH_VECTOR_APPLICATOR_H
-#include <config-vc.h>
+#include <xsimd_extensions/xsimd.hpp>
-#if defined HAVE_VC
-
-#include <KoVcMultiArchBuildSupport.h>
+#if defined HAVE_XSIMD
#include "kis_brush_mask_scalar_applicator.h"
@@ -24,13 +22,13 @@ struct FastRowProcessor {
{
}
- template<Vc::Implementation _impl>
+ template<typename _impl>
void process(float *buffer, int width, float y, float cosa, float sina, float centerX, float centerY);
typename V::Private *d;
};
-template<class MaskGenerator, Vc::Implementation _impl>
+template<class MaskGenerator, typename _impl>
struct KisBrushMaskVectorApplicator : public KisBrushMaskScalarApplicator<MaskGenerator, _impl> {
KisBrushMaskVectorApplicator(MaskGenerator *maskGenerator)
: KisBrushMaskScalarApplicator<MaskGenerator, _impl>(maskGenerator)
@@ -46,18 +44,18 @@ protected:
void processVector(const QRect &rect);
private:
- template<class U, Vc::Implementation V>
+ template<class U, typename V>
struct TypeHelper {
};
private:
template<class U>
- inline void startProcessing(const QRect &rect, TypeHelper<U, Vc::ScalarImpl>)
+ inline void startProcessing(const QRect &rect, TypeHelper<U, xsimd::generic>)
{
KisBrushMaskScalarApplicator<MaskGenerator, _impl>::processScalar(rect);
}
- template<class U, Vc::Implementation V>
+ template<class U, typename V>
inline void startProcessing(const QRect &rect, TypeHelper<U, V>)
{
MaskGenerator *m_maskGenerator = KisBrushMaskScalarApplicator<MaskGenerator, _impl>::m_maskGenerator;
@@ -70,9 +68,11 @@ private:
}
};
-template<class MaskGenerator, Vc::Implementation impl>
+template<class MaskGenerator, typename impl>
void KisBrushMaskVectorApplicator<MaskGenerator, impl>::processVector(const QRect &rect)
{
+ using float_v = xsimd::batch<float, impl>;
+
const MaskProcessingData *m_d = KisBrushMaskApplicatorBase::m_d;
MaskGenerator *m_maskGenerator = KisBrushMaskScalarApplicator<MaskGenerator, impl>::m_maskGenerator;
@@ -86,12 +86,12 @@ void KisBrushMaskVectorApplicator<MaskGenerator, impl>::processVector(const QRec
// We need to compute over a multiple of the SIMD register width
size_t alignOffset = 0;
- if (width % Vc::float_v::size() != 0) {
- alignOffset = Vc::float_v::size() - (width % Vc::float_v::size());
+ if (width % float_v::size != 0) {
+ alignOffset = float_v::size - (width % float_v::size);
}
size_t simdWidth = width + alignOffset;
- float *buffer = Vc::malloc<float, Vc::AlignOnCacheline>(simdWidth);
+ auto *buffer = xsimd::vector_aligned_malloc<float>(simdWidth);
FastRowProcessor<MaskGenerator> processor(m_maskGenerator);
@@ -135,9 +135,9 @@ void KisBrushMaskVectorApplicator<MaskGenerator, impl>::processVector(const QRec
} // endfor x
dabPointer += offset;
} // endfor y
- Vc::free(buffer);
+ xsimd::vector_aligned_free(buffer);
}
-#endif /* defined HAVE_VC */
+#endif /* defined HAVE_XSIMD */
#endif /* KIS_BRUSH_VECTOR_APPLICATOR_H */
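The buffer handling above rounds the row width up to a whole number of batches so the final aligned store cannot run past the allocation. A standalone sketch of the same arithmetic (vector_aligned_malloc/vector_aligned_free are the thin wrappers this patch adds under libs/multiarch; here the underlying xsimd calls are used directly):

    #include <xsimd/xsimd.hpp>

    #include <cstddef>

    using float_v = xsimd::batch<float, xsimd::default_arch>;

    float *make_row_buffer(std::size_t width, std::size_t &simdWidth)
    {
        std::size_t alignOffset = 0;
        if (width % float_v::size != 0) {
            alignOffset = float_v::size - (width % float_v::size);
        }
        simdWidth = width + alignOffset; // now a multiple of float_v::size
        return static_cast<float *>(
            xsimd::aligned_malloc(simdWidth * sizeof(float),
                                  xsimd::default_arch::alignment()));
    }

    // Release with xsimd::aligned_free(buffer) when done.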
diff --git a/libs/image/kis_circle_mask_generator.h b/libs/image/kis_circle_mask_generator.h
index fa55bcdecb..11f85ef732 100644
--- a/libs/image/kis_circle_mask_generator.h
+++ b/libs/image/kis_circle_mask_generator.h
@@ -14,7 +14,7 @@
#include <QScopedPointer>
template <typename V>
-class FastRowProcessor;
+struct FastRowProcessor;
/**
* Create, serialize and deserialize an elliptical 8-bit mask.
@@ -48,7 +48,7 @@ private:
struct Private;
const QScopedPointer<Private> d;
- friend class FastRowProcessor<KisCircleMaskGenerator>;
+ friend struct FastRowProcessor<KisCircleMaskGenerator>;
};
#endif
diff --git a/libs/image/kis_curve_circle_mask_generator.h b/libs/image/kis_curve_circle_mask_generator.h
index 205d76e7ca..d7f919fea3 100644
--- a/libs/image/kis_curve_circle_mask_generator.h
+++ b/libs/image/kis_curve_circle_mask_generator.h
@@ -22,7 +22,7 @@ class QDomDocument;
class QPointF;
template<typename V>
-class FastRowProcessor;
+struct FastRowProcessor;
/**
* This mask generator use softness/hardness defined by user curve
@@ -61,7 +61,7 @@ private:
struct Private;
const QScopedPointer<Private> d;
- friend class FastRowProcessor<KisCurveCircleMaskGenerator>;
+ friend struct FastRowProcessor<KisCurveCircleMaskGenerator>;
};
#endif
diff --git a/libs/image/kis_curve_rect_mask_generator.h b/libs/image/kis_curve_rect_mask_generator.h
index 4f05d63b54..ef0059a7bc 100644
--- a/libs/image/kis_curve_rect_mask_generator.h
+++ b/libs/image/kis_curve_rect_mask_generator.h
@@ -17,7 +17,7 @@ class QDomDocument;
#include "kis_base_mask_generator.h"
template<typename V>
-class FastRowProcessor;
+struct FastRowProcessor;
/**
* Curve based softness for this rectangular mask generator
@@ -47,7 +47,7 @@ private:
struct Private;
const QScopedPointer<Private> d;
- friend class FastRowProcessor<KisCurveRectangleMaskGenerator>;
+ friend struct FastRowProcessor<KisCurveRectangleMaskGenerator>;
};
#endif
diff --git a/libs/image/kis_gauss_circle_mask_generator.h b/libs/image/kis_gauss_circle_mask_generator.h
index ab431e81a6..48ebe59846 100644
--- a/libs/image/kis_gauss_circle_mask_generator.h
+++ b/libs/image/kis_gauss_circle_mask_generator.h
@@ -15,7 +15,7 @@
#include <QScopedPointer>
template<typename V>
-class FastRowProcessor;
+struct FastRowProcessor;
/**
* This mask generator uses a Gaussian-blurred circle
@@ -47,7 +47,7 @@ private:
struct Private;
const QScopedPointer<Private> d;
- friend class FastRowProcessor<KisGaussCircleMaskGenerator>;
+ friend struct FastRowProcessor<KisGaussCircleMaskGenerator>;
};
#endif
diff --git a/libs/image/kis_gauss_rect_mask_generator.h b/libs/image/kis_gauss_rect_mask_generator.h
index 1594e80f48..ebd0fc40a5 100644
--- a/libs/image/kis_gauss_rect_mask_generator.h
+++ b/libs/image/kis_gauss_rect_mask_generator.h
@@ -14,7 +14,7 @@
#include "kis_base_mask_generator.h"
template<typename V>
-class FastRowProcessor;
+struct FastRowProcessor;
/**
* This mask generator uses a Gaussian-blurred rectangle
@@ -39,7 +39,7 @@ private:
struct Private;
const QScopedPointer<Private> d;
- friend class FastRowProcessor<KisGaussRectangleMaskGenerator>;
+ friend struct FastRowProcessor<KisGaussRectangleMaskGenerator>;
};
#endif
diff --git a/libs/image/kis_rect_mask_generator.h b/libs/image/kis_rect_mask_generator.h
index 01b8210994..13d7edc0e6 100644
--- a/libs/image/kis_rect_mask_generator.h
+++ b/libs/image/kis_rect_mask_generator.h
@@ -16,7 +16,7 @@
#include "kis_base_mask_generator.h"
template<typename V>
-class FastRowProcessor;
+struct FastRowProcessor;
/**
* Represent, serialize and deserialize a rectangular 8-bit mask.
@@ -43,7 +43,7 @@ private:
struct Private;
const QScopedPointer<Private> d;
- friend class FastRowProcessor<KisRectangleMaskGenerator>;
+ friend struct FastRowProcessor<KisRectangleMaskGenerator>;
};
#endif
diff --git a/libs/image/tests/CMakeLists.txt b/libs/image/tests/CMakeLists.txt
index 10a6355eb0..211d5cef19 100644
--- a/libs/image/tests/CMakeLists.txt
+++ b/libs/image/tests/CMakeLists.txt
@@ -24,10 +24,6 @@ include_Directories(SYSTEM
${EIGEN3_INCLUDE_DIR}
)
-if(HAVE_VC)
- include_directories(${Vc_INCLUDE_DIR})
-endif()
-
include(ECMAddTests)
macro_add_unittest_definitions()
diff --git a/libs/image/vc_extra_math.h b/libs/image/vc_extra_math.h
index edd306291d..26729bced4 100644
--- a/libs/image/vc_extra_math.h
+++ b/libs/image/vc_extra_math.h
@@ -1,5 +1,6 @@
/*
* SPDX-FileCopyrightText: 2018 Iván Santa María <ghevan at gmail.com>
+ * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy at amyspark.me>
*
* SPDX-License-Identifier: GPL-2.0-or-later
*/
@@ -7,40 +8,42 @@
#ifndef VC_ADDITIONAL_MATH_H
#define VC_ADDITIONAL_MATH_H
-#include <config-vc.h>
+#include <xsimd_extensions/xsimd.hpp>
-#if defined HAVE_VC
-
-#include <Vc/Vc>
-#include <Vc/IO>
+#if defined HAVE_XSIMD
class VcExtraMath
{
public:
+
// vectorized erf function, precision 1e-5
- static Vc_ALWAYS_INLINE Vc::float_v erf(Vc::float_v::AsArg x) {
- Vc::float_v xa = abs(x);
- Vc::float_m precisionLimit(xa >= 9.3f); // wrong result for any number beyond this
- xa(precisionLimit) = 0;
- Vc::float_v sign(Vc::One);
- Vc::float_m invertMask = x < 0.f;
- sign(invertMask) = -1.f;
+ template<typename A>
+ static inline xsimd::batch<float, A> erf(const xsimd::batch<float, A> x)
+ {
+ using float_v = xsimd::batch<float, A>;
+ using float_m = typename float_v::batch_bool_type;
+ float_v xa = xsimd::abs(x);
+ float_m precisionLimit = xa >= float_v(9.3f); // wrong result for any number beyond this
+ xa = xsimd::set_zero(xa, precisionLimit);
+ float_v sign(1.0f);
+ float_m invertMask = x < float_v(0.f);
+ sign = xsimd::select(invertMask, float_v(-1.f), sign);
// CONSTANTS
- float a1 = 0.254829592;
- float a2 = -0.284496736;
- float a3 = 1.421413741;
- float a4 = -1.453152027;
- float a5 = 1.061405429;
- float p = 0.3275911;
-
- Vc::float_v t = 1.0f / (1.0f + p * xa);
- Vc::float_v y = 1.0f - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * exp(-xa * xa);
- y(precisionLimit) = 1.0f;
+ float a1 = 0.254829592f;
+ float a2 = -0.284496736f;
+ float a3 = 1.421413741f;
+ float a4 = -1.453152027f;
+ float a5 = 1.061405429f;
+ float p = 0.3275911f;
+
+ float_v t = 1.0f / (1.0f + p * xa);
+ float_v y = 1.0f - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * exp(-xa * xa);
+ y = xsimd::set_one(y, precisionLimit);
return sign * y;
}
};
-#endif /* defined HAVE_VC */
+#endif /* defined HAVE_XSIMD */
#endif // VC_ADDITIONAL_MATH_H
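The port keeps the same Abramowitz-Stegun rational approximation (constants a1..a5 and p), now templated over the architecture so every per-arch object file instantiates its own copy. A hedged usage sketch, assuming vc_extra_math.h and the xsimd extensions are on the include path of an xsimd-enabled build:

    #include <vc_extra_math.h>

    #include <cmath>
    #include <cstddef>
    #include <cstdio>

    int main()
    {
        using float_v = xsimd::batch<float, xsimd::default_arch>;
        alignas(float_v::arch_type::alignment()) float in[float_v::size];
        alignas(float_v::arch_type::alignment()) float out[float_v::size];
        for (std::size_t i = 0; i < float_v::size; ++i)
            in[i] = 0.25f * static_cast<float>(i);

        const float_v y = VcExtraMath::erf(float_v::load_aligned(in));
        y.store_aligned(out);

        // Should agree with libm to about 1e-5, per the comment above.
        for (std::size_t i = 0; i < float_v::size; ++i)
            std::printf("erf(%.2f) = %.5f (libm %.5f)\n",
                        in[i], out[i], std::erf(in[i]));
    }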
diff --git a/libs/multiarch/CMakeLists.txt b/libs/multiarch/CMakeLists.txt
new file mode 100644
index 0000000000..01104c3247
--- /dev/null
+++ b/libs/multiarch/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_library(kritamultiarch INTERFACE)
+
+if (HAVE_XSIMD)
+ target_link_libraries(kritamultiarch INTERFACE xsimd)
+endif()
+
+set_target_properties(kritamultiarch PROPERTIES
+ INTERFACE_INCLUDE_DIRECTORIES $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/>
+)
diff --git a/libs/multiarch/xsimd_extensions/arch/xsimd_generic.hpp b/libs/multiarch/xsimd_extensions/arch/xsimd_generic.hpp
new file mode 100644
index 0000000000..aaddaaa382
--- /dev/null
+++ b/libs/multiarch/xsimd_extensions/arch/xsimd_generic.hpp
@@ -0,0 +1,133 @@
+/*
+ * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy at amyspark.me>
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#ifndef KIS_XSIMD_GENERIC_HPP
+#define KIS_XSIMD_GENERIC_HPP
+
+#include "xsimd_generic_details.hpp"
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <limits>
+#include <type_traits>
+#include <utility>
+
+namespace xsimd
+{
+/***********************
+ * Truncate-initialize *
+ ***********************/
+
+template<typename V, typename T, typename A>
+inline batch<T, A> truncate_to_type(xsimd::batch<T, A> const &self) noexcept
+{
+ return kernel::detail::apply_with_value(
+ [](float i) -> float {
+ if (std::numeric_limits<V>::min() > i) {
+ return 0;
+ } else if (std::numeric_limits<V>::max() < i) {
+ return 0;
+ } else {
+ return static_cast<V>(i);
+ }
+ },
+ self);
+}
+
+// Mask to 0 elements of a vector.
+template<typename T, typename A>
+inline auto set_zero(const batch<T, A> &src, const batch_bool<T, A> &mask) noexcept
+{
+ return xsimd::select(mask, xsimd::batch<T, A>(0), src);
+}
+
+// Mask to 1 elements of a vector.
+template<typename T, typename A>
+inline auto set_one(const batch<T, A> &src, const batch_bool<T, A> &mask) noexcept
+{
+ return xsimd::select(mask, xsimd::batch<T, A>(1), src);
+}
+
+/**********************************
+ * Sign-extending unaligned loads *
+ **********************************/
+
+// Load `T::size` values from the array of `T2` elements.
+template<typename T, typename T2>
+inline T load_and_extend(const T2 *src) noexcept
+{
+ return kernel::detail::apply_with_index_and_value(
+ [&](size_t i, typename T::value_type) {
+ return static_cast<typename T::value_type>(src[i]);
+ },
+ T{});
+}
+
+/*************************************************
+ * Type-inferred, auto-aligned memory allocation *
+ *************************************************/
+
+// Allocate size bytes of memory aligned to `batch<T, A>::alignment()`.
+template<typename T, typename A>
+inline T *aligned_malloc(size_t size) noexcept
+{
+ using T_v = batch<T, A>;
+
+ return reinterpret_cast<T *>(xsimd::aligned_malloc(size, T_v::arch_type::alignment()));
+}
+
+// Return the maximum of a list of templated values at compile time.
+template<size_t value, size_t... values>
+constexpr typename std::enable_if<sizeof...(values) == 0, size_t>::type max()
+{
+ return value;
+}
+
+// Return the maximum of a list of templated values at compile time.
+template<size_t value, size_t... values>
+constexpr typename std::enable_if<sizeof...(values) != 0, size_t>::type max()
+{
+ return std::max(value, max<values...>());
+}
+
+// Allocate memory for `sz` T items, aligned to the selected architecture's
+// alignment.
+template<typename T, typename A>
+inline T *vector_aligned_malloc(size_t sz) noexcept
+{
+ return static_cast<T *>(xsimd::aligned_malloc(sz * sizeof(T), A::alignment()));
+}
+
+// Free allocated memory, hiding the `const_cast` if necessary.
+template<typename T>
+inline void vector_aligned_free(const T *ptr) noexcept
+{
+ // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
+ return xsimd::aligned_free(const_cast<T *>(ptr));
+}
+
+/****************
+ * Interleaving *
+ ****************/
+
+// Return the pair of interleaved batches `a` and `b`.
+// First element is the low half, second is the upper half.
+template<typename V>
+inline std::pair<V, V> interleave(const V &a, const V &b) noexcept
+{
+ return {xsimd::zip_lo(a, b), xsimd::zip_hi(a, b)};
+}
+
+/**********************
+ * Quadratic function *
+ **********************/
+
+template<typename T, typename A>
+inline batch<T, A> pow2(batch<T, A> const &self) noexcept
+{
+ return self * self;
+}
+
+}; // namespace xsimd
+
+#endif
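Taken together these helpers let the ported kernels read nearly line-for-line like their Vc originals. A small sketch exercising the additions, assuming an xsimd-enabled build:

    #include <xsimd_extensions/xsimd.hpp>

    using float_v = xsimd::batch<float, xsimd::current_arch>;

    // Same shape as the fade computations in the brush mask processors.
    float_v fade(float_v xr, float_v yr, float_v n)
    {
        const float_v vOne(1.0f);
        const float_v normFade = xsimd::pow2(xr) + xsimd::pow2(yr);
        float_v f = n * (normFade - vOne) / (normFade - n);
        f = xsimd::set_zero(f, normFade < vOne); // Vc: f.setZero(mask)
        f = xsimd::set_one(f, n > vOne);         // Vc: f(mask) = 1.0f
        return f;
    }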
diff --git a/libs/multiarch/xsimd_extensions/arch/xsimd_generic_details.hpp b/libs/multiarch/xsimd_extensions/arch/xsimd_generic_details.hpp
new file mode 100644
index 0000000000..ec98b4fdef
--- /dev/null
+++ b/libs/multiarch/xsimd_extensions/arch/xsimd_generic_details.hpp
@@ -0,0 +1,105 @@
+/*
+ * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy at amyspark.me>
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#ifndef KIS_XSIMD_GENERIC_DETAILS_DECL_HPP
+#define KIS_XSIMD_GENERIC_DETAILS_DECL_HPP
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+#include <utility>
+
+namespace xsimd
+{
+/***********************
+ * Truncate-initialize *
+ ***********************/
+
+template<typename V, typename T, typename A>
+batch<T, A> truncate_to_type(batch<T, A> const &self) noexcept;
+
+/**************************
+ * Masked initializations *
+ **************************/
+
+// Mask to 0 elements of a vector.
+template<typename T, typename A>
+inline auto set_zero(const batch<T, A> &src, const batch_bool<T, A> &mask) noexcept;
+
+// Mask to 1 elements of a vector.
+template<typename T, typename A>
+inline auto set_one(const batch<T, A> &src, const batch_bool<T, A> &mask) noexcept;
+
+/**********************************
+ * Sign-extending unaligned loads *
+ **********************************/
+
+// Load `T::size` values from the array of `T2` elements.
+template<typename T, typename T2>
+inline T load_and_extend(const T2 *src) noexcept;
+
+/*************************************************
+ * Type-inferred, auto-aligned memory allocation *
+ *************************************************/
+
+// Allocate size bytes of memory aligned to `batch<T, A>::alignment()`.
+template<typename T, typename A = xsimd::current_arch>
+inline T *aligned_malloc(size_t size) noexcept;
+
+// Allocate memory for `sz` T items, aligned to the selected architecture's
+// alignment.
+template<typename T, typename A = xsimd::current_arch>
+inline T *vector_aligned_malloc(size_t sz) noexcept;
+
+// Free allocated memory, hiding the `const_cast` if necessary.
+template<typename T>
+inline void vector_aligned_free(const T *ptr) noexcept;
+
+/****************
+ * Interleaving *
+ ****************/
+
+// Return the pair of interleaved batches `a` and `b`.
+// First element is the low half, second is the upper half.
+template<typename V>
+inline std::pair<V, V> interleave(const V &a, const V &b) noexcept;
+
+template<typename T, typename A>
+inline xsimd::batch<T, A> pow2(xsimd::batch<T, A> const &self) noexcept;
+
+namespace kernel
+{
+namespace detail
+{
+/*****************************
+ * Helpers: unary applicator *
+ *****************************/
+
+template<class F, class A, class T>
+inline batch<T, A> apply_with_value(F &&func, batch<T, A> const &self) noexcept
+{
+ alignas(A::alignment()) std::array<T, batch<T, A>::size> self_buffer;
+ self.store_aligned(self_buffer.data());
+ for (std::size_t i = 0; i < batch<T, A>::size; ++i) {
+ self_buffer[i] = func(self_buffer[i]);
+ }
+ return batch<T, A>::load_aligned(self_buffer.data());
+}
+
+template<class F, class A, class T>
+inline batch<T, A> apply_with_index_and_value(F &&func, batch<T, A> const &self) noexcept
+{
+ alignas(A::alignment()) std::array<T, batch<T, A>::size> self_buffer;
+ self.store_aligned(self_buffer.data());
+ for (std::size_t i = 0; i < batch<T, A>::size; ++i) {
+ self_buffer[i] = func(i, self_buffer[i]);
+ }
+ return batch<T, A>::load_aligned(self_buffer.data());
+}
+} // namespace detail
+} // namespace kernel
+
+}; // namespace xsimd
+
+#endif // KIS_XSIMD_GENERIC_DETAILS_DECL_HPP
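The two appliers at the bottom are the portable fallback: spill the batch to an aligned buffer, run the functor lane by lane, reload. Anything built on them (truncate_to_type, load_and_extend) is therefore correct on every architecture, merely not vectorized. A usage sketch under that assumption (a real kernel would prefer a native op such as xsimd::clip; this only shows the mechanism):

    #include <xsimd_extensions/xsimd.hpp>

    using float_v = xsimd::batch<float, xsimd::current_arch>;

    float_v clamp01(const float_v &v)
    {
        return xsimd::kernel::detail::apply_with_value(
            [](float x) -> float {
                return x < 0.0f ? 0.0f : (x > 1.0f ? 1.0f : x);
            },
            v);
    }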
diff --git a/libs/multiarch/xsimd_extensions/arch/xsimd_isa.hpp b/libs/multiarch/xsimd_extensions/arch/xsimd_isa.hpp
new file mode 100644
index 0000000000..a9e3077c9c
--- /dev/null
+++ b/libs/multiarch/xsimd_extensions/arch/xsimd_isa.hpp
@@ -0,0 +1,18 @@
+/*
+ * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy at amyspark.me>
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#ifndef KIS_XSIMD_ISA_HPP
+#define KIS_XSIMD_ISA_HPP
+
+#include <cstdint>
+#include <type_traits>
+
+#include "../config/xsimd_arch.hpp"
+
+// Must come last to have access to all conversion specializations.
+#include "./xsimd_generic.hpp"
+
+#endif
diff --git a/libs/multiarch/xsimd_extensions/config/xsimd_arch.hpp b/libs/multiarch/xsimd_extensions/config/xsimd_arch.hpp
new file mode 100644
index 0000000000..4d71379f98
--- /dev/null
+++ b/libs/multiarch/xsimd_extensions/config/xsimd_arch.hpp
@@ -0,0 +1,106 @@
+/*
+ * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy at amyspark.me>
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#ifndef KIS_XSIMD_ARCH_HPP
+#define KIS_XSIMD_ARCH_HPP
+
+#include "./xsimd_config.hpp"
+
+// Architecture initialization, borrowed from Vc.
+// Define each of the following names as a unique integer, since integers are
+// the only thing the preprocessor can compare. This makes it possible to use
+// -DXSIMD_IMPL=SSE3: the preprocessor will then consider XSIMD_IMPL and SSE3
+// to be equal. A separate FMA bit allows detecting the FMA extension on top
+// of the base implementation, which IMPL_MASK extracts.
+
+#define Scalar 0x00100000
+#define SSE2 0x00200000
+#define SSE3 0x00300000
+#define SSSE3 0x00400000
+#define SSE4_1 0x00500000
+#define SSE4_2 0x00600000
+#define FMA4 0x00700000
+#define AVX 0x00800000
+#define AVX2 0x00900000
+#define AVX512F 0x00A00000
+#define AVX512BW 0x00B00000
+#define AVX512CD 0x00C00000
+#define AVX512DQ 0x00D00000
+#define NEON 0x00E00000
+#define NEON64 0x00F00000
+
+#define FMA 0x00000001
+
+#define IMPL_MASK 0xFFF00000
+
+namespace xsimd
+{
+#if !defined(HAVE_XSIMD) || defined(XSIMD_IMPL) && (XSIMD_IMPL & IMPL_MASK) == Scalar
+using current_arch = generic;
+#elif !defined(_MSC_VER) || !defined(XSIMD_IMPL)
+using current_arch = default_arch;
+#elif (XSIMD_IMPL & IMPL_MASK) == SSE2
+using current_arch = sse2;
+#elif (XSIMD_IMPL & IMPL_MASK) == SSE3
+using current_arch = sse3;
+#elif (XSIMD_IMPL & IMPL_MASK) == SSSE3
+using current_arch = ssse3;
+#elif (XSIMD_IMPL & IMPL_MASK) == SSE4_1
+using current_arch = sse4_1;
+#elif (XSIMD_IMPL & IMPL_MASK) == SSE4_2
+#if (XSIMD_IMPL & FMA)
+using current_arch = fma3<sse4_2>;
+#else
+using current_arch = sse4_2;
+#endif
+#elif (XSIMD_IMPL & IMPL_MASK) == FMA4
+using current_arch = fma4;
+#elif (XSIMD_IMPL & IMPL_MASK) == AVX
+#if (XSIMD_IMPL & FMA)
+using current_arch = fma3<avx>;
+#else
+using current_arch = avx;
+#endif
+#elif (XSIMD_IMPL & IMPL_MASK) == AVX2
+#if (XSIMD_IMPL & FMA)
+using current_arch = fma3<avx2>;
+#else
+using current_arch = avx2;
+#endif
+#elif (XSIMD_IMPL & IMPL_MASK) == AVX512F
+using current_arch = avx512f;
+#elif (XSIMD_IMPL & IMPL_MASK) == AVX512CD
+using current_arch = avx512cd;
+#elif (XSIMD_IMPL & IMPL_MASK) == AVX512DQ
+using current_arch = avx512dq;
+#elif (XSIMD_IMPL & IMPL_MASK) == AVX512BW
+using current_arch = avx512bw;
+#elif (XSIMD_IMPL & IMPL_MASK) == NEON
+using current_arch = neon;
+#elif (XSIMD_IMPL & IMPL_MASK) == NEON64
+using current_arch = neon64;
+#endif
+}; // namespace xsimd
+
+#undef Scalar
+#undef SSE2
+#undef SSE3
+#undef SSSE3
+#undef SSE4_1
+#undef SSE4_2
+#undef AVX
+#undef AVX2
+#undef AVX512F
+#undef AVX512BW
+#undef AVX512CD
+#undef AVX512DQ
+#undef NEON
+#undef NEON64
+
+#undef FMA
+#undef FMA4
+#undef IMPL_MASK
+
+#endif // KIS_XSIMD_ARCH_HPP
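The upshot: on MSVC, where per-file -msse4.1 style flags do not steer xsimd's default_arch, the build system pins each translation unit with a single define such as /DXSIMD_IMPL=AVX2, and the alias above resolves accordingly; elsewhere the compiler flags decide. Whatever it resolves to can be inspected directly (sketch, assuming an xsimd-enabled build):

    #include <xsimd_extensions/xsimd.hpp>

    #include <cstdio>

    int main()
    {
        std::printf("current_arch: %s (alignment %zu)\n",
                    xsimd::current_arch::name(),
                    xsimd::current_arch::alignment());
    }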
diff --git a/libs/multiarch/xsimd_extensions/config/xsimd_config.hpp b/libs/multiarch/xsimd_extensions/config/xsimd_config.hpp
new file mode 100644
index 0000000000..abc1aa9132
--- /dev/null
+++ b/libs/multiarch/xsimd_extensions/config/xsimd_config.hpp
@@ -0,0 +1,47 @@
+/*
+ * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy at amyspark.me>
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#ifndef KIS_XSIMD_CONFIG_HPP
+#define KIS_XSIMD_CONFIG_HPP
+
+#include <config-xsimd.h>
+
+#ifdef HAVE_XSIMD
+
+// MSVC patching.
+#if defined(_MSC_VER)
+#if defined(_M_ARM64)
+#ifndef NDEBUG
+#pragma message("Patching over MSVC for aarch64.")
+#endif
+#define __ARM_ARCH 8
+#define __aarch64__ 1
+#define __ARM_NEON 1
+#endif
+
+#if defined(_M_ARM)
+#ifndef NDEBUG
+#pragma message("Patching over MSVC for arm-v7a.")
+#endif
+#define __ARM_ARCH _M_ARM
+#define __ARM_NEON 1
+#endif
+#endif
+
+#include <xsimd/xsimd.hpp>
+
+#else /* HAVE_XSIMD */
+
+namespace xsimd
+{
+class generic
+{
+};
+}; // namespace xsimd
+
+#endif /* HAVE_XSIMD */
+
+#endif // KIS_XSIMD_CONFIG_HPP
diff --git a/libs/multiarch/xsimd_extensions/xsimd.hpp b/libs/multiarch/xsimd_extensions/xsimd.hpp
new file mode 100644
index 0000000000..9fd94e1c9f
--- /dev/null
+++ b/libs/multiarch/xsimd_extensions/xsimd.hpp
@@ -0,0 +1,18 @@
+/*
+ * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy at amyspark.me>
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#ifndef XSIMD_EXTENSIONS_H
+#define XSIMD_EXTENSIONS_H
+
+// xsimd detection and architecture setup.
+#include "config/xsimd_arch.hpp"
+
+#ifdef HAVE_XSIMD
+// xsimd extensions.
+#include "arch/xsimd_isa.hpp"
+#endif // HAVE_XSIMD
+
+#endif // XSIMD_EXTENSIONS_H
diff --git a/libs/pigment/CMakeLists.txt b/libs/pigment/CMakeLists.txt
index fd4482cec0..f60eb98a5c 100644
--- a/libs/pigment/CMakeLists.txt
+++ b/libs/pigment/CMakeLists.txt
@@ -19,9 +19,7 @@ endif()
set(LINK_VC_LIB)
-if(HAVE_VC)
- include_directories(SYSTEM ${Vc_INCLUDE_DIR})
- set(LINK_VC_LIB ${Vc_LIBRARIES})
+if(HAVE_XSIMD)
ko_compile_for_all_implementations_no_scalar(__per_arch_factory_objs compositeops/KoOptimizedCompositeOpFactoryPerArch.cpp)
ko_compile_for_all_implementations(__per_arch_alpha_applicator_factory_objs KoAlphaMaskApplicatorFactoryImpl.cpp)
ko_compile_for_all_implementations(__per_arch_rgb_scaler_factory_objs KoOptimizedPixelDataScalerU8ToU16FactoryImpl.cpp)
@@ -96,7 +94,7 @@ set(kritapigment_SRCS
resources/KisGradientConversion.cpp
)
-set (EXTRA_LIBRARIES ${LINK_OPENEXR_LIB} ${LINK_VC_LIB})
+set (EXTRA_LIBRARIES ${LINK_OPENEXR_LIB} kritamultiarch)
if(MSVC OR (WIN32 AND "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel"))
# avoid "cannot open file 'LIBC.lib'" error
diff --git a/libs/pigment/KoAlphaMaskApplicator.h b/libs/pigment/KoAlphaMaskApplicator.h
index ccea8d7018..40658d87cf 100644
--- a/libs/pigment/KoAlphaMaskApplicator.h
+++ b/libs/pigment/KoAlphaMaskApplicator.h
@@ -1,5 +1,6 @@
/*
* SPDX-FileCopyrightText: 2020 Dmitry Kazakov <dimula73 at gmail.com>
+ * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy at amyspark.me>
*
* SPDX-License-Identifier: GPL-2.0-or-later
*/
@@ -9,13 +10,13 @@
#include "KoAlphaMaskApplicatorBase.h"
#include "KoColorSpaceTraits.h"
-#include "KoVcMultiArchBuildSupport.h"
+#include "KoMultiArchBuildSupport.h"
template<typename _channels_type_,
int _channels_nb_,
int _alpha_pos_,
- Vc::Implementation _impl,
+ typename _impl,
typename EnableDummyType = void>
struct KoAlphaMaskApplicator : public KoAlphaMaskApplicatorBase
{
@@ -49,18 +50,18 @@ struct KoAlphaMaskApplicator : public KoAlphaMaskApplicatorBase
}
};
-#ifdef HAVE_VC
+#ifdef HAVE_XSIMD
#include "KoStreamedMath.h"
-template<Vc::Implementation _impl>
+template<typename _impl>
struct KoAlphaMaskApplicator<
quint8, 4, 3, _impl,
- typename std::enable_if<_impl != Vc::ScalarImpl>::type> : public KoAlphaMaskApplicatorBase
+ typename std::enable_if<!std::is_same<_impl, xsimd::generic>::value>::type> : public KoAlphaMaskApplicatorBase
{
using uint_v = typename KoStreamedMath<_impl>::uint_v;
using int_v = typename KoStreamedMath<_impl>::int_v;
-
+ using float_v = typename KoStreamedMath<_impl>::float_v;
static constexpr int numChannels = 4;
static constexpr int alphaPos = 3;
@@ -69,27 +70,26 @@ struct KoAlphaMaskApplicator<
const float *alpha,
qint32 nPixels) const override
{
- const int block1 = nPixels / static_cast<int>(Vc::float_v::size());
- const int block2 = nPixels % static_cast<int>(Vc::float_v::size());
- const int vectorPixelStride = numChannels * static_cast<int>(Vc::float_v::size());
+ const int block1 = nPixels / static_cast<int>(float_v::size);
+ const int block2 = nPixels % static_cast<int>(float_v::size);
+ const int vectorPixelStride = numChannels * static_cast<int>(float_v::size);
for (int i = 0; i < block1; i++) {
- Vc::float_v maskAlpha(alpha, Vc::Unaligned);
+ auto maskAlpha = float_v::load_unaligned(alpha);
- uint_v data_i;
- data_i.load(reinterpret_cast<const quint32*>(pixels), Vc::Unaligned);
+ auto data_i = uint_v::load_unaligned(reinterpret_cast<const quint32 *>(pixels));
- Vc::float_v pixelAlpha = Vc::simd_cast<Vc::float_v>(data_i >> 24U);
- pixelAlpha *= Vc::float_v(1.0f) - maskAlpha;
+ auto pixelAlpha = xsimd::to_float(xsimd::bitwise_cast<int_v>(data_i >> 24U));
+ pixelAlpha *= float_v(1.0f) - maskAlpha;
const quint32 colorChannelsMask = 0x00FFFFFF;
- uint_v pixelAlpha_i = uint_v(int_v(Vc::round(pixelAlpha)));
+ uint_v pixelAlpha_i = xsimd::bitwise_cast<uint_v>(xsimd::nearbyint_as_int(pixelAlpha));
data_i = (data_i & colorChannelsMask) | (pixelAlpha_i << 24);
- data_i.store(reinterpret_cast<quint32 *>(pixels), Vc::Unaligned);
+ data_i.store_unaligned(reinterpret_cast<typename uint_v::value_type *>(pixels));
pixels += vectorPixelStride;
- alpha += Vc::float_v::size();
+ alpha += float_v::size;
}
KoColorSpaceTrait<quint8, 4, 3>::
@@ -100,21 +100,21 @@ struct KoAlphaMaskApplicator<
const float * alpha,
const quint8 *brushColor,
qint32 nPixels) const override {
- const int block1 = nPixels / static_cast<int>(Vc::float_v::size());
- const int block2 = nPixels % static_cast<int>(Vc::float_v::size());
- const int vectorPixelStride = numChannels * static_cast<int>(Vc::float_v::size());
+ const int block1 = nPixels / static_cast<int>(float_v::size);
+ const int block2 = nPixels % static_cast<int>(float_v::size);
+ const int vectorPixelStride = numChannels * static_cast<int>(float_v::size);
const uint_v brushColor_i(*reinterpret_cast<const quint32*>(brushColor) & 0x00FFFFFFu);
for (int i = 0; i < block1; i++) {
- Vc::float_v maskAlpha(alpha, Vc::Unaligned);
- Vc::float_v pixelAlpha = Vc::float_v(255.0f) * (Vc::float_v(1.0f) - maskAlpha);
+ auto maskAlpha = float_v::load_unaligned(alpha);
+ auto pixelAlpha = float_v(255.0f) * (float_v(1.0f) - maskAlpha);
- uint_v pixelAlpha_i = uint_v(int_v(Vc::round(pixelAlpha)));
+ uint_v pixelAlpha_i = xsimd::bitwise_cast<uint_v>(xsimd::nearbyint_as_int(pixelAlpha));
uint_v data_i = brushColor_i | (pixelAlpha_i << 24);
- data_i.store(reinterpret_cast<quint32 *>(pixels), Vc::Unaligned);
+ data_i.store_unaligned(reinterpret_cast<typename uint_v::value_type *>(pixels));
pixels += vectorPixelStride;
- alpha += Vc::float_v::size();
+ alpha += float_v::size;
}
KoColorSpaceTrait<quint8, 4, 3>::
@@ -128,26 +128,25 @@ struct KoAlphaMaskApplicator<
}
void fillGrayBrushWithColor(quint8 *dst, const QRgb *brush, quint8 *brushColor, qint32 nPixels) const override {
- const int block1 = nPixels / static_cast<int>(Vc::float_v::size());
- const int block2 = nPixels % static_cast<int>(Vc::float_v::size());
- const int vectorPixelStride = numChannels * static_cast<int>(Vc::float_v::size());
+ const int block1 = nPixels / static_cast<int>(float_v::size);
+ const int block2 = nPixels % static_cast<int>(float_v::size);
+ const int vectorPixelStride = numChannels * static_cast<int>(float_v::size);
const uint_v brushColor_i(*reinterpret_cast<const quint32*>(brushColor) & 0x00FFFFFFu);
const uint_v redChannelMask(0xFF);
for (int i = 0; i < block1; i++) {
- uint_v maskPixels;
- maskPixels.load(reinterpret_cast<const quint32*>(brush), Vc::Unaligned);
+ const auto maskPixels = uint_v::load_unaligned(reinterpret_cast<const quint32*>(brush));
const uint_v pixelAlpha = maskPixels >> 24;
const uint_v pixelRed = maskPixels & redChannelMask;
const uint_v pixelAlpha_i = multiply(redChannelMask - pixelRed, pixelAlpha);
const uint_v data_i = brushColor_i | (pixelAlpha_i << 24);
- data_i.store(reinterpret_cast<quint32 *>(dst), Vc::Unaligned);
+ data_i.store_unaligned(reinterpret_cast<typename uint_v::value_type *>(dst));
dst += vectorPixelStride;
- brush += Vc::float_v::size();
+ brush += float_v::size;
}
KoColorSpaceTrait<quint8, 4, 3>::
@@ -155,6 +154,6 @@ struct KoAlphaMaskApplicator<
}
};
-#endif /* HAVE_VC */
+#endif /* HAVE_XSIMD */
#endif // KOALPHAMASKAPPLICATOR_H
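
The mechanical part of the port is visible above: Vc's constructor-style unaligned loads become static load_unaligned() calls, stores become store_unaligned(), Vc::round becomes xsimd::nearbyint_as_int, and the size() function becomes the size constant. A minimal, self-contained sketch of that idiom with plain xsimd batch types standing in for the KoStreamedMath aliases (the scalar tail mirrors the block2 remainder loops; this assumes an xsimd release that ships nearbyint_as_int, which this commit backs with a generic fallback):

    #include <cmath>
    #include <cstddef>
    #include <cstdint>
    #include <xsimd/xsimd.hpp>

    // Quantize normalized float alpha to rounded 32-bit ints, one batch
    // at a time, with a scalar loop for the remainder.
    void quantizeAlpha(const float *in, int32_t *out, std::size_t n)
    {
        using float_v = xsimd::batch<float>; // compile-time default arch
        using int_v = xsimd::batch<int32_t>;
        const std::size_t step = float_v::size;
        std::size_t i = 0;
        for (; i + step <= n; i += step) {
            const float_v v = float_v::load_unaligned(in + i) * float_v(255.0f);
            const int_v r = xsimd::nearbyint_as_int(v); // round to nearest
            r.store_unaligned(out + i);
        }
        for (; i < n; ++i) {
            out[i] = static_cast<int32_t>(std::lround(in[i] * 255.0f));
        }
    }
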
diff --git a/libs/pigment/KoAlphaMaskApplicatorFactoryImpl.cpp b/libs/pigment/KoAlphaMaskApplicatorFactoryImpl.cpp
index 702cf62add..d9c5ac2edc 100644
--- a/libs/pigment/KoAlphaMaskApplicatorFactoryImpl.cpp
+++ b/libs/pigment/KoAlphaMaskApplicatorFactoryImpl.cpp
@@ -15,7 +15,7 @@
template<typename _channels_type_,
int _channels_nb_,
int _alpha_pos_>
-template<Vc::Implementation _impl>
+template<typename _impl>
KoAlphaMaskApplicatorBase*
KoAlphaMaskApplicatorFactoryImpl<_channels_type_, _channels_nb_, _alpha_pos_>::create(int)
{
@@ -25,30 +25,30 @@ KoAlphaMaskApplicatorFactoryImpl<_channels_type_, _channels_nb_, _alpha_pos_>::c
_impl>();
}
-template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<quint8, 4, 3>::create<Vc::CurrentImplementation::current()>(int);
-template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<quint16, 4, 3>::create<Vc::CurrentImplementation::current()>(int);
+template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<quint8, 4, 3>::create<xsimd::current_arch>(int);
+template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<quint16, 4, 3>::create<xsimd::current_arch>(int);
#ifdef HAVE_OPENEXR
-template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<half, 4, 3>::create<Vc::CurrentImplementation::current()>(int);
+template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<half, 4, 3>::create<xsimd::current_arch>(int);
#endif
-template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<float, 4, 3>::create<Vc::CurrentImplementation::current()>(int);
+template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<float, 4, 3>::create<xsimd::current_arch>(int);
-template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<quint8, 5, 4>::create<Vc::CurrentImplementation::current()>(int);
-template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<quint16, 5, 4>::create<Vc::CurrentImplementation::current()>(int);
+template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<quint8, 5, 4>::create<xsimd::current_arch>(int);
+template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<quint16, 5, 4>::create<xsimd::current_arch>(int);
#ifdef HAVE_OPENEXR
-template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<half, 5, 4>::create<Vc::CurrentImplementation::current()>(int);
+template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<half, 5, 4>::create<xsimd::current_arch>(int);
#endif
-template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<float, 5, 4>::create<Vc::CurrentImplementation::current()>(int);
+template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<float, 5, 4>::create<xsimd::current_arch>(int);
-template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<quint8, 2, 1>::create<Vc::CurrentImplementation::current()>(int);
-template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<quint16, 2, 1>::create<Vc::CurrentImplementation::current()>(int);
+template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<quint8, 2, 1>::create<xsimd::current_arch>(int);
+template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<quint16, 2, 1>::create<xsimd::current_arch>(int);
#ifdef HAVE_OPENEXR
-template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<half, 2, 1>::create<Vc::CurrentImplementation::current()>(int);
+template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<half, 2, 1>::create<xsimd::current_arch>(int);
#endif
-template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<float, 2, 1>::create<Vc::CurrentImplementation::current()>(int);
+template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<float, 2, 1>::create<xsimd::current_arch>(int);
-template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<quint8, 1, 0>::create<Vc::CurrentImplementation::current()>(int);
-template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<quint16, 1, 0>::create<Vc::CurrentImplementation::current()>(int);
+template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<quint8, 1, 0>::create<xsimd::current_arch>(int);
+template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<quint16, 1, 0>::create<xsimd::current_arch>(int);
#ifdef HAVE_OPENEXR
-template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<half, 1, 0>::create<Vc::CurrentImplementation::current()>(int);
+template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<half, 1, 0>::create<xsimd::current_arch>(int);
#endif
-template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<float, 1, 0>::create<Vc::CurrentImplementation::current()>(int);
+template KoAlphaMaskApplicatorBase* KoAlphaMaskApplicatorFactoryImpl<float, 1, 0>::create<xsimd::current_arch>(int);
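
These instantiation lines are the heart of the multi-arch build: the member template is declared in the header, defined in a .cpp that is compiled once per target ISA, and explicitly instantiated for the arch that translation unit targets. Note that xsimd::current_arch comes from Krita's xsimd_extensions; stock xsimd spells the compile-time default xsimd::default_arch. A compilable sketch of the pattern, with illustrative names:

    #include <xsimd/xsimd.hpp>

    struct WidgetBase {
        virtual ~WidgetBase() = default;
    };

    template<typename Arch>
    struct Widget : WidgetBase {}; // body vectorized with Arch's batches

    struct WidgetFactory {
        template<typename Arch>
        static WidgetBase *create(int param);
    };

    template<typename Arch>
    WidgetBase *WidgetFactory::create(int /*param*/)
    {
        return new Widget<Arch>();
    }

    // One line like this per arch-specific translation unit; the TU's
    // compile flags (-msse4.1, -mavx2, ...) decide what Widget<Arch>
    // actually compiles down to.
    template WidgetBase *WidgetFactory::create<xsimd::default_arch>(int);
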
diff --git a/libs/pigment/KoAlphaMaskApplicatorFactoryImpl.h b/libs/pigment/KoAlphaMaskApplicatorFactoryImpl.h
index c0fae4dcf7..4368987661 100644
--- a/libs/pigment/KoAlphaMaskApplicatorFactoryImpl.h
+++ b/libs/pigment/KoAlphaMaskApplicatorFactoryImpl.h
@@ -7,7 +7,7 @@
#define KOALPHAMASKAPPLICATORFACTORYIMPL_H
#include <KoAlphaMaskApplicatorBase.h>
-#include <KoVcMultiArchBuildSupport.h>
+#include <KoMultiArchBuildSupport.h>
template<typename _channels_type_,
int _channels_nb_,
@@ -18,7 +18,7 @@ public:
typedef int ParamType;
typedef KoAlphaMaskApplicatorBase* ReturnType;
- template<Vc::Implementation _impl>
+ template<typename _impl>
static KoAlphaMaskApplicatorBase* create(int);
};
diff --git a/libs/pigment/KoOptimizedPixelDataScalerU8ToU16.h b/libs/pigment/KoOptimizedPixelDataScalerU8ToU16.h
index 36002f6c9f..06fdc9308f 100644
--- a/libs/pigment/KoOptimizedPixelDataScalerU8ToU16.h
+++ b/libs/pigment/KoOptimizedPixelDataScalerU8ToU16.h
@@ -9,7 +9,7 @@
#include "KoOptimizedPixelDataScalerU8ToU16Base.h"
-#include "KoVcMultiArchBuildSupport.h"
+#include "KoMultiArchBuildSupport.h"
#include "kis_debug.h"
#if defined(__i386__) || defined(__x86_64__)
@@ -17,7 +17,7 @@
#endif
-template<Vc::Implementation _impl>
+template<typename _impl = xsimd::current_arch>
class KoOptimizedPixelDataScalerU8ToU16 : public KoOptimizedPixelDataScalerU8ToU16Base
{
public:
diff --git a/libs/pigment/KoOptimizedPixelDataScalerU8ToU16FactoryImpl.cpp b/libs/pigment/KoOptimizedPixelDataScalerU8ToU16FactoryImpl.cpp
index a0f43ebb11..ef214dd79e 100644
--- a/libs/pigment/KoOptimizedPixelDataScalerU8ToU16FactoryImpl.cpp
+++ b/libs/pigment/KoOptimizedPixelDataScalerU8ToU16FactoryImpl.cpp
@@ -8,10 +8,11 @@
#include "KoOptimizedPixelDataScalerU8ToU16.h"
-template<Vc::Implementation _impl>
+template<typename _impl>
KoOptimizedPixelDataScalerU8ToU16Base *KoOptimizedPixelDataScalerU8ToU16FactoryImpl::create(int channelsPerPixel)
{
return new KoOptimizedPixelDataScalerU8ToU16<_impl>(channelsPerPixel);
}
-template KoOptimizedPixelDataScalerU8ToU16Base *KoOptimizedPixelDataScalerU8ToU16FactoryImpl::create<Vc::CurrentImplementation::current()>(int);
+template KoOptimizedPixelDataScalerU8ToU16Base *
+KoOptimizedPixelDataScalerU8ToU16FactoryImpl::create<xsimd::current_arch>(int);
diff --git a/libs/pigment/KoOptimizedPixelDataScalerU8ToU16FactoryImpl.h b/libs/pigment/KoOptimizedPixelDataScalerU8ToU16FactoryImpl.h
index ad22efdaf7..c2762def1c 100644
--- a/libs/pigment/KoOptimizedPixelDataScalerU8ToU16FactoryImpl.h
+++ b/libs/pigment/KoOptimizedPixelDataScalerU8ToU16FactoryImpl.h
@@ -8,7 +8,7 @@
#define KoOptimizedPixelDataScalerU8ToU16FACTORYIMPL_H
#include <KoOptimizedPixelDataScalerU8ToU16Base.h>
-#include <KoVcMultiArchBuildSupport.h>
+#include <KoMultiArchBuildSupport.h>
class KRITAPIGMENT_EXPORT KoOptimizedPixelDataScalerU8ToU16FactoryImpl
{
@@ -16,7 +16,7 @@ public:
typedef int ParamType;
typedef KoOptimizedPixelDataScalerU8ToU16Base* ReturnType;
- template<Vc::Implementation _impl>
+ template<typename _impl>
static KoOptimizedPixelDataScalerU8ToU16Base* create(int);
};
diff --git a/libs/pigment/compositeops/KoMultiArchBuildSupport.h b/libs/pigment/compositeops/KoMultiArchBuildSupport.h
new file mode 100644
index 0000000000..af02048995
--- /dev/null
+++ b/libs/pigment/compositeops/KoMultiArchBuildSupport.h
@@ -0,0 +1,86 @@
+/*
+ * SPDX-FileCopyrightText: 2012 Dmitry Kazakov <dimula73 at gmail.com>
+ * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy at amyspark.me>
+ *
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ */
+
+#ifndef KO_MULTI_ARCH_BUILD_SUPPORT_H
+#define KO_MULTI_ARCH_BUILD_SUPPORT_H
+
+
+#include <QDebug>
+#include <ksharedconfig.h>
+#include <kconfig.h>
+#include <kconfiggroup.h>
+#include <xsimd_extensions/xsimd.hpp>
+
+template<class FactoryType>
+typename FactoryType::ReturnType
+createOptimizedClass(typename FactoryType::ParamType param)
+{
+#ifdef HAVE_XSIMD
+ const auto best_arch = xsimd::available_architectures().best;
+
+#ifdef Q_PROCESSOR_X86
+ static bool isConfigInitialized = false;
+ static bool useVectorization = true;
+ static bool disableAVXOptimizations = false;
+
+ if (!isConfigInitialized) {
+ KConfigGroup cfg = KSharedConfig::openConfig()->group("");
+ // use the old key name for compatibility
+ useVectorization = !cfg.readEntry("amdDisableVectorWorkaround", false);
+ disableAVXOptimizations = cfg.readEntry("disableAVXOptimizations", false);
+ isConfigInitialized = true;
+ }
+
+ if (!useVectorization) {
+ qWarning() << "WARNING: vector instructions disabled by the \'amdDisableVectorWorkaround\' option!";
+ return FactoryType::template create<xsimd::generic>(param);
+ }
+
+ if (disableAVXOptimizations
+ && (xsimd::avx::version() <= best_arch || xsimd::avx2::version() <= best_arch)) {
+ qWarning() << "WARNING: AVX and AVX2 optimizations are disabled by the \'disableAVXOptimizations\' option!";
+ }
+
+ /**
+ * We use SSE2, SSSE3, SSE4.1, AVX and AVX2+FMA.
+ * The rest are integer and string instructions mostly.
+ */
+ if (!disableAVXOptimizations && xsimd::fma3<xsimd::avx2>::version() <= best_arch) {
+ return FactoryType::template create<xsimd::fma3<xsimd::avx2>>(param);
+ } else if (!disableAVXOptimizations && xsimd::avx::version() <= best_arch) {
+ return FactoryType::template create<xsimd::avx>(param);
+ } else if (xsimd::sse4_1::version() <= best_arch) {
+ return FactoryType::template create<xsimd::sse4_1>(param);
+ } else if (xsimd::ssse3::version() <= best_arch) {
+ return FactoryType::template create<xsimd::ssse3>(param);
+ } else if (xsimd::sse2::version() <= best_arch) {
+ return FactoryType::template create<xsimd::sse2>(param);
+ }
+#elif XSIMD_WITH_NEON64
+ if (xsimd::neon64::version() <= best_arch) {
+ return FactoryType::template create<xsimd::neon64>(param);
+ }
+#elif XSIMD_WITH_NEON
+ if (xsimd::neon::version() <= best_arch) {
+ return FactoryType::template create<xsimd::neon>(param);
+ }
+#endif // Q_PROCESSOR_X86
+#endif // HAVE_XSIMD
+
+ return FactoryType::template create<xsimd::generic>(param);
+}
+
+template<class FactoryType>
+typename FactoryType::ReturnType
+createOptimizedClass(typename FactoryType::ParamType param, bool forceScalarImplementation)
+{
+ if (forceScalarImplementation) {
+ return FactoryType::template create<xsimd::generic>(param);
+ }
+ return createOptimizedClass<FactoryType>(param);
+}
+
+#endif /* KO_MULTI_ARCH_BUILD_SUPPORT_H */
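
This header is the whole runtime-dispatch surface: any factory exposing ParamType, ReturnType, and a create<Arch>() member template can be handed to createOptimizedClass(), which probes the CPU via xsimd::available_architectures() and forwards to the best instantiation, falling back to the xsimd::generic extension arch. A sketch of a conforming factory (all names illustrative):

    struct ScalerBase {
        virtual ~ScalerBase() = default;
    };

    template<typename Arch>
    struct Scaler : ScalerBase {
        explicit Scaler(int channels) { (void)channels; }
    };

    struct ScalerFactory {
        using ParamType = int;
        using ReturnType = ScalerBase *;

        template<typename Arch>
        static ReturnType create(ParamType channels)
        {
            return new Scaler<Arch>(channels);
        }
    };

    // ScalerBase *best = createOptimizedClass<ScalerFactory>(4);
    // ScalerBase *scalar = createOptimizedClass<ScalerFactory>(4, true); // force generic
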
diff --git a/libs/pigment/compositeops/KoOptimizedCompositeOpAlphaDarken128.h b/libs/pigment/compositeops/KoOptimizedCompositeOpAlphaDarken128.h
index 994852e785..e7235e189c 100644
--- a/libs/pigment/compositeops/KoOptimizedCompositeOpAlphaDarken128.h
+++ b/libs/pigment/compositeops/KoOptimizedCompositeOpAlphaDarken128.h
@@ -1,5 +1,6 @@
/*
* SPDX-FileCopyrightText: 2016 Thorsten Zachmann <zachmann at kde.org>
+ * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy at amyspark.me>
*
* SPDX-License-Identifier: LGPL-2.0-or-later
*/
@@ -26,7 +27,7 @@ struct AlphaDarkenCompositor128 {
/**
* This is a vector equivalent of compositeOnePixelScalar(). It is expected
- * to process Vc::float_v::size() pixels in a single pass.
+ * to process float_v::size pixels in a single pass.
*
* o the \p haveMask parameter indicates whether the real (non-null) mask
* pointer is passed to the function.
@@ -35,23 +36,26 @@ struct AlphaDarkenCompositor128 {
* \p src_aligned.
* o the \p dst pointer must always(!) be aligned to the boundary
* of a streaming vector. Unaligned writes are really expensive.
- * o This function is *never* used if HAVE_VC is not present
+ * o This function is *never* used if HAVE_XSIMD is not present
*/
- template<bool haveMask, bool src_aligned, Vc::Implementation _impl>
+ template<bool haveMask, bool src_aligned, typename _impl>
static ALWAYS_INLINE void compositeVector(const quint8 *src, quint8 *dst, const quint8 *mask, float opacity, const ParamsWrapper &oparams)
{
- Vc::float_v src_c1;
- Vc::float_v src_c2;
- Vc::float_v src_c3;
- Vc::float_v src_alpha;
+ using float_v = typename KoStreamedMath<_impl>::float_v;
+ using float_m = typename float_v::batch_bool_type;
+
+ float_v src_c1;
+ float_v src_c2;
+ float_v src_c3;
+ float_v src_alpha;
PixelWrapper<channels_type, _impl> dataWrapper;
dataWrapper.read(const_cast<quint8*>(src), src_c1, src_c2, src_c3, src_alpha);
- Vc::float_v msk_norm_alpha;
+ float_v msk_norm_alpha;
if (haveMask) {
- const Vc::float_v uint8Rec1(1.0f / 255.0f);
- Vc::float_v mask_vec = KoStreamedMath<_impl>::fetch_mask_8(mask);
+ const float_v uint8Rec1(1.0f / 255.0f);
+ float_v mask_vec = KoStreamedMath<_impl>::fetch_mask_8(mask);
msk_norm_alpha = mask_vec * uint8Rec1 * src_alpha;
}
else {
@@ -63,35 +67,31 @@ struct AlphaDarkenCompositor128 {
// instead we use value calculated by ParamsWrapper
opacity = oparams.opacity;
- Vc::float_v opacity_vec(opacity);
+ float_v opacity_vec(opacity);
src_alpha = msk_norm_alpha * opacity_vec;
- const Vc::float_v zeroValue(static_cast<float>(KoColorSpaceMathsTraits<channels_type>::zeroValue));
+ const float_v zeroValue(static_cast<float>(KoColorSpaceMathsTraits<channels_type>::zeroValue));
- Vc::float_v dst_c1;
- Vc::float_v dst_c2;
- Vc::float_v dst_c3;
- Vc::float_v dst_alpha;
+ float_v dst_c1;
+ float_v dst_c2;
+ float_v dst_c3;
+ float_v dst_alpha;
dataWrapper.read(dst, dst_c1, dst_c2, dst_c3, dst_alpha);
- Vc::float_m empty_dst_pixels_mask = dst_alpha == zeroValue;
+ float_m empty_dst_pixels_mask = dst_alpha == zeroValue;
- if (!empty_dst_pixels_mask.isFull()) {
- if (empty_dst_pixels_mask.isEmpty()) {
+ if (!xsimd::all(empty_dst_pixels_mask)) {
+ if (xsimd::none(empty_dst_pixels_mask)) {
dst_c1 = (src_c1 - dst_c1) * src_alpha + dst_c1;
dst_c2 = (src_c2 - dst_c2) * src_alpha + dst_c2;
dst_c3 = (src_c3 - dst_c3) * src_alpha + dst_c3;
}
else {
- dst_c1(empty_dst_pixels_mask) = src_c1;
- dst_c2(empty_dst_pixels_mask) = src_c2;
- dst_c3(empty_dst_pixels_mask) = src_c3;
- Vc::float_m not_empty_dst_pixels_mask = !empty_dst_pixels_mask;
- dst_c1(not_empty_dst_pixels_mask) = (src_c1 - dst_c1) * src_alpha + dst_c1;
- dst_c2(not_empty_dst_pixels_mask) = (src_c2 - dst_c2) * src_alpha + dst_c2;
- dst_c3(not_empty_dst_pixels_mask) = (src_c3 - dst_c3) * src_alpha + dst_c3;
+ dst_c1 = xsimd::select(empty_dst_pixels_mask, src_c1, (src_c1 - dst_c1) * src_alpha + dst_c1);
+ dst_c2 = xsimd::select(empty_dst_pixels_mask, src_c2, (src_c2 - dst_c2) * src_alpha + dst_c2);
+ dst_c3 = xsimd::select(empty_dst_pixels_mask, src_c3, (src_c3 - dst_c3) * src_alpha + dst_c3);
}
}
else {
@@ -100,24 +100,24 @@ struct AlphaDarkenCompositor128 {
dst_c3 = src_c3;
}
- Vc::float_v fullFlowAlpha(dst_alpha);
+ float_v fullFlowAlpha(dst_alpha);
if (oparams.averageOpacity > opacity) {
- Vc::float_v average_opacity_vec(oparams.averageOpacity);
- Vc::float_m fullFlowAlpha_mask = average_opacity_vec > dst_alpha;
- fullFlowAlpha(fullFlowAlpha_mask) = (average_opacity_vec - src_alpha) * (dst_alpha / average_opacity_vec) + src_alpha;
+ float_v average_opacity_vec(oparams.averageOpacity);
+ float_m fullFlowAlpha_mask = average_opacity_vec > dst_alpha;
+ fullFlowAlpha = xsimd::select(fullFlowAlpha_mask, (average_opacity_vec - src_alpha) * (dst_alpha / average_opacity_vec) + src_alpha, fullFlowAlpha);
}
else {
- Vc::float_m fullFlowAlpha_mask = opacity_vec > dst_alpha;
- fullFlowAlpha(fullFlowAlpha_mask) = (opacity_vec - dst_alpha) * msk_norm_alpha + dst_alpha;
+ float_m fullFlowAlpha_mask = opacity_vec > dst_alpha;
+ fullFlowAlpha = xsimd::select(fullFlowAlpha_mask, (opacity_vec - dst_alpha) * msk_norm_alpha + dst_alpha, fullFlowAlpha);
}
if (oparams.flow == 1.0) {
dst_alpha = fullFlowAlpha;
}
else {
- Vc::float_v zeroFlowAlpha = ParamsWrapper::calculateZeroFlowAlpha(src_alpha, dst_alpha);
- Vc::float_v flow_norm_vec(oparams.flow);
+ float_v zeroFlowAlpha = ParamsWrapper::calculateZeroFlowAlpha(src_alpha, dst_alpha);
+ float_v flow_norm_vec(oparams.flow);
dst_alpha = (fullFlowAlpha - zeroFlowAlpha) * flow_norm_vec + zeroFlowAlpha;
}
@@ -127,7 +127,7 @@ struct AlphaDarkenCompositor128 {
/**
* Composes one pixel of the source into the destination
*/
- template <bool haveMask, Vc::Implementation _impl>
+ template <bool haveMask, typename _impl>
static ALWAYS_INLINE void compositeOnePixelScalar(const quint8 *s, quint8 *d, const quint8 *mask, float opacity, const ParamsWrapper &oparams)
{
using namespace Arithmetic;
@@ -186,7 +186,7 @@ struct AlphaDarkenCompositor128 {
* colorspaces with alpha channel placed at the last byte of
* the pixel: C1_C2_C3_A.
*/
-template<Vc::Implementation _impl, typename ParamsWrapper>
+template<typename _impl, typename ParamsWrapper>
class KoOptimizedCompositeOpAlphaDarken128Impl : public KoCompositeOp
{
public:
@@ -195,7 +195,7 @@ public:
using KoCompositeOp::composite;
- virtual void composite(const KoCompositeOp::ParameterInfo& params) const override
+ void composite(const KoCompositeOp::ParameterInfo& params) const override
{
if(params.maskRowStart) {
KoStreamedMath<_impl>::template genericComposite128<true, true, AlphaDarkenCompositor128<float, ParamsWrapper> >(params);
@@ -205,7 +205,7 @@ public:
}
};
-template<Vc::Implementation _impl>
+template<typename _impl>
class KoOptimizedCompositeOpAlphaDarkenHard128
: public KoOptimizedCompositeOpAlphaDarken128Impl<_impl, KoAlphaDarkenParamsWrapperHard>
{
@@ -214,7 +214,7 @@ public:
: KoOptimizedCompositeOpAlphaDarken128Impl<_impl, KoAlphaDarkenParamsWrapperHard>(cs) {}
};
-template<Vc::Implementation _impl>
+template<typename _impl>
class KoOptimizedCompositeOpAlphaDarkenCreamy128
: public KoOptimizedCompositeOpAlphaDarken128Impl<_impl, KoAlphaDarkenParamsWrapperCreamy>
{
@@ -223,7 +223,7 @@ public:
: KoOptimizedCompositeOpAlphaDarken128Impl<_impl, KoAlphaDarkenParamsWrapperCreamy>(cs) {}
};
-template<Vc::Implementation _impl, typename ParamsWrapper>
+template<typename _impl, typename ParamsWrapper>
class KoOptimizedCompositeOpAlphaDarkenU64Impl : public KoCompositeOp
{
public:
@@ -232,7 +232,7 @@ public:
using KoCompositeOp::composite;
- virtual void composite(const KoCompositeOp::ParameterInfo& params) const override
+ void composite(const KoCompositeOp::ParameterInfo& params) const override
{
if(params.maskRowStart) {
KoStreamedMath<_impl>::template genericComposite64<true, true, AlphaDarkenCompositor128<quint16, ParamsWrapper> >(params);
@@ -242,7 +242,7 @@ public:
}
};
-template<Vc::Implementation _impl>
+template<typename _impl>
class KoOptimizedCompositeOpAlphaDarkenHardU64
: public KoOptimizedCompositeOpAlphaDarkenU64Impl<_impl, KoAlphaDarkenParamsWrapperHard>
{
@@ -251,7 +251,7 @@ public:
: KoOptimizedCompositeOpAlphaDarkenU64Impl<_impl, KoAlphaDarkenParamsWrapperHard>(cs) {}
};
-template<Vc::Implementation _impl>
+template<typename _impl>
class KoOptimizedCompositeOpAlphaDarkenCreamyU64
: public KoOptimizedCompositeOpAlphaDarkenU64Impl<_impl, KoAlphaDarkenParamsWrapperCreamy>
{
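
The one structural change in these compositors, rather than a renaming, is the masked assignment: Vc's write-masking (dst_c1(mask) = value) has no xsimd equivalent, so each pair of masked stores collapses into a single branch-free xsimd::select, as in the hunks above. The same blend as a standalone sketch:

    #include <xsimd/xsimd.hpp>

    using float_v = xsimd::batch<float>;
    using float_m = float_v::batch_bool_type;

    // Lanes whose destination is empty take the source verbatim; the
    // rest are interpolated towards the source by its alpha.
    inline float_v blendChannel(float_v src, float_v dst,
                                float_v srcAlpha, float_m emptyDst)
    {
        const float_v mixed = (src - dst) * srcAlpha + dst;
        // Vc:    dst(emptyDst) = src; dst(!emptyDst) = mixed;
        // xsimd: one select covers both masked assignments.
        return xsimd::select(emptyDst, src, mixed);
    }
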
diff --git a/libs/pigment/compositeops/KoOptimizedCompositeOpAlphaDarken32.h b/libs/pigment/compositeops/KoOptimizedCompositeOpAlphaDarken32.h
index a45ac30805..5d57f02409 100644
--- a/libs/pigment/compositeops/KoOptimizedCompositeOpAlphaDarken32.h
+++ b/libs/pigment/compositeops/KoOptimizedCompositeOpAlphaDarken32.h
@@ -1,6 +1,7 @@
/*
* SPDX-FileCopyrightText: 2006 Cyrille Berger <cberger at cberger.net>
* SPDX-FileCopyrightText: 2011 Silvio Heinrich <plassy at web.de>
+ * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy at amyspark.me>
*
* SPDX-License-Identifier: LGPL-2.0-or-later
*/
@@ -20,7 +21,7 @@ struct AlphaDarkenCompositor32 {
/**
* This is a vector equivalent of compositeOnePixelScalar(). It is expected
- * to process Vc::float_v::size() pixels in a single pass.
+ * to process float_v::size pixels in a single pass.
*
* o the \p haveMask parameter indicates whether the real (non-null) mask
* pointer is passed to the function.
@@ -29,36 +30,39 @@ struct AlphaDarkenCompositor32 {
* \p src_aligned.
* o the \p dst pointer must always(!) be aligned to the boundary
* of a streaming vector. Unaligned writes are really expensive.
- * o This function is *never* used if HAVE_VC is not present
+ * o This function is *never* used if HAVE_XSIMD is not present
*/
- template<bool haveMask, bool src_aligned, Vc::Implementation _impl>
+ template<bool haveMask, bool src_aligned, typename _impl>
static ALWAYS_INLINE void compositeVector(const quint8 *src, quint8 *dst, const quint8 *mask, float opacity, const ParamsWrapper &oparams)
{
- Vc::float_v src_alpha;
- Vc::float_v dst_alpha;
+ using float_v = typename KoStreamedMath<_impl>::float_v;
+ using float_m = typename float_v::batch_bool_type;
+
+ float_v src_alpha;
+ float_v dst_alpha;
// we don't use directly passed value
Q_UNUSED(opacity);
// instead we use value calculated by ParamsWrapper
opacity = oparams.opacity;
- Vc::float_v opacity_vec(255.0f * opacity);
+ float_v opacity_vec(255.0f * opacity);
- Vc::float_v average_opacity_vec(255.0 * oparams.averageOpacity);
- Vc::float_v flow_norm_vec(oparams.flow);
+ float_v average_opacity_vec(255.0 * oparams.averageOpacity);
+ float_v flow_norm_vec(oparams.flow);
- Vc::float_v uint8MaxRec2(1.0f / (255.0f * 255.0f));
- Vc::float_v uint8MaxRec1(1.0f / 255.0f);
- Vc::float_v uint8Max(255.0f);
- Vc::float_v zeroValue(Vc::Zero);
+ float_v uint8MaxRec2(1.0f / (255.0f * 255.0f));
+ float_v uint8MaxRec1(1.0f / 255.0f);
+ float_v uint8Max(255.0f);
+ float_v zeroValue(0);
- Vc::float_v msk_norm_alpha;
+ float_v msk_norm_alpha;
src_alpha = KoStreamedMath<_impl>::template fetch_alpha_32<src_aligned>(src);
if (haveMask) {
- Vc::float_v mask_vec = KoStreamedMath<_impl>::fetch_mask_8(mask);
+ float_v mask_vec = KoStreamedMath<_impl>::fetch_mask_8(mask);
msk_norm_alpha = src_alpha * mask_vec * uint8MaxRec2;
} else {
msk_norm_alpha = src_alpha * uint8MaxRec1;
@@ -67,88 +71,80 @@ struct AlphaDarkenCompositor32 {
dst_alpha = KoStreamedMath<_impl>::template fetch_alpha_32<true>(dst);
src_alpha = msk_norm_alpha * opacity_vec;
- Vc::float_m empty_dst_pixels_mask = dst_alpha == zeroValue;
+ float_m empty_dst_pixels_mask = dst_alpha == zeroValue;
- Vc::float_v src_c1;
- Vc::float_v src_c2;
- Vc::float_v src_c3;
+ float_v src_c1;
+ float_v src_c2;
+ float_v src_c3;
- Vc::float_v dst_c1;
- Vc::float_v dst_c2;
- Vc::float_v dst_c3;
+ float_v dst_c1;
+ float_v dst_c2;
+ float_v dst_c3;
KoStreamedMath<_impl>::template fetch_colors_32<src_aligned>(src, src_c1, src_c2, src_c3);
- bool srcAlphaIsZero = (src_alpha == zeroValue).isFull();
+ bool srcAlphaIsZero = xsimd::all(src_alpha == zeroValue);
if (srcAlphaIsZero) return;
- bool dstAlphaIsZero = empty_dst_pixels_mask.isFull();
+ bool dstAlphaIsZero = xsimd::all(empty_dst_pixels_mask);
- Vc::float_v dst_blend = src_alpha * uint8MaxRec1;
+ float_v dst_blend = src_alpha * uint8MaxRec1;
- bool srcAlphaIsUnit = (src_alpha == uint8Max).isFull();
+ bool srcAlphaIsUnit = xsimd::all(src_alpha == uint8Max);
if (dstAlphaIsZero) {
dst_c1 = src_c1;
dst_c2 = src_c2;
dst_c3 = src_c3;
} else if (srcAlphaIsUnit) {
- bool dstAlphaIsUnit = (dst_alpha == uint8Max).isFull();
+ bool dstAlphaIsUnit = xsimd::all(dst_alpha == uint8Max);
if (dstAlphaIsUnit) {
- memcpy(dst, src, 4 * Vc::float_v::size());
+ memcpy(dst, src, 4 * float_v::size);
return;
} else {
dst_c1 = src_c1;
dst_c2 = src_c2;
dst_c3 = src_c3;
}
- } else if (empty_dst_pixels_mask.isEmpty()) {
+ } else if (xsimd::none(empty_dst_pixels_mask)) {
KoStreamedMath<_impl>::template fetch_colors_32<true>(dst, dst_c1, dst_c2, dst_c3);
dst_c1 = dst_blend * (src_c1 - dst_c1) + dst_c1;
dst_c2 = dst_blend * (src_c2 - dst_c2) + dst_c2;
dst_c3 = dst_blend * (src_c3 - dst_c3) + dst_c3;
} else {
KoStreamedMath<_impl>::template fetch_colors_32<true>(dst, dst_c1, dst_c2, dst_c3);
- dst_c1(empty_dst_pixels_mask) = src_c1;
- dst_c2(empty_dst_pixels_mask) = src_c2;
- dst_c3(empty_dst_pixels_mask) = src_c3;
-
- Vc::float_m not_empty_dst_pixels_mask = !empty_dst_pixels_mask;
-
- dst_c1(not_empty_dst_pixels_mask) = dst_blend * (src_c1 - dst_c1) + dst_c1;
- dst_c2(not_empty_dst_pixels_mask) = dst_blend * (src_c2 - dst_c2) + dst_c2;
- dst_c3(not_empty_dst_pixels_mask) = dst_blend * (src_c3 - dst_c3) + dst_c3;
+ dst_c1 = xsimd::select(empty_dst_pixels_mask, src_c1, dst_blend * (src_c1 - dst_c1) + dst_c1);
+ dst_c2 = xsimd::select(empty_dst_pixels_mask, src_c2, dst_blend * (src_c2 - dst_c2) + dst_c2);
+ dst_c3 = xsimd::select(empty_dst_pixels_mask, src_c3, dst_blend * (src_c3 - dst_c3) + dst_c3);
}
- Vc::float_v fullFlowAlpha;
+ float_v fullFlowAlpha;
if (oparams.averageOpacity > opacity) {
- Vc::float_m fullFlowAlpha_mask = average_opacity_vec > dst_alpha;
+ float_m fullFlowAlpha_mask = average_opacity_vec > dst_alpha;
- if (fullFlowAlpha_mask.isEmpty()) {
+ if (xsimd::none(fullFlowAlpha_mask)) {
fullFlowAlpha = dst_alpha;
} else {
- Vc::float_v reverse_blend = dst_alpha / average_opacity_vec;
- Vc::float_v opt1 = (average_opacity_vec - src_alpha) * reverse_blend + src_alpha;
- fullFlowAlpha(!fullFlowAlpha_mask) = dst_alpha;
- fullFlowAlpha(fullFlowAlpha_mask) = opt1;
+ float_v reverse_blend = dst_alpha / average_opacity_vec;
+ float_v opt1 = (average_opacity_vec - src_alpha) * reverse_blend + src_alpha;
+ fullFlowAlpha = xsimd::select(fullFlowAlpha_mask, opt1, dst_alpha);
}
} else {
- Vc::float_m fullFlowAlpha_mask = opacity_vec > dst_alpha;
+ float_m fullFlowAlpha_mask = opacity_vec > dst_alpha;
- if (fullFlowAlpha_mask.isEmpty()) {
+ if (xsimd::none(fullFlowAlpha_mask)) {
fullFlowAlpha = dst_alpha;
} else {
- Vc::float_v opt1 = (opacity_vec - dst_alpha) * msk_norm_alpha + dst_alpha;
- fullFlowAlpha(!fullFlowAlpha_mask) = dst_alpha;
- fullFlowAlpha(fullFlowAlpha_mask) = opt1;
+ float_v opt1 = (opacity_vec - dst_alpha) * msk_norm_alpha + dst_alpha;
+ fullFlowAlpha = xsimd::select(fullFlowAlpha_mask, opt1, dst_alpha);
}
}
if (oparams.flow == 1.0) {
dst_alpha = fullFlowAlpha;
} else {
- Vc::float_v zeroFlowAlpha = ParamsWrapper::calculateZeroFlowAlpha(src_alpha, dst_alpha, uint8MaxRec1);
+ float_v zeroFlowAlpha = ParamsWrapper::calculateZeroFlowAlpha(src_alpha, dst_alpha, uint8MaxRec1);
dst_alpha = (fullFlowAlpha - zeroFlowAlpha) * flow_norm_vec + zeroFlowAlpha;
}
@@ -158,7 +154,7 @@ struct AlphaDarkenCompositor32 {
/**
* Composes one pixel of the source into the destination
*/
- template <bool haveMask, Vc::Implementation _impl>
+ template <bool haveMask, typename _impl>
static ALWAYS_INLINE void compositeOnePixelScalar(const channels_type *src, channels_type *dst, const quint8 *mask, float opacity, const ParamsWrapper &oparams)
{
using namespace Arithmetic;
@@ -224,7 +220,7 @@ struct AlphaDarkenCompositor32 {
* colorspaces with alpha channel placed at the last byte of
* the pixel: C1_C2_C3_A.
*/
-template<Vc::Implementation _impl, class ParamsWrapper>
+template<typename _impl, class ParamsWrapper>
class KoOptimizedCompositeOpAlphaDarken32Impl : public KoCompositeOp
{
public:
@@ -243,7 +239,7 @@ public:
}
};
-template<Vc::Implementation _impl>
+template<typename _impl = xsimd::current_arch>
class KoOptimizedCompositeOpAlphaDarkenHard32 :
public KoOptimizedCompositeOpAlphaDarken32Impl<_impl, KoAlphaDarkenParamsWrapperHard>
{
@@ -253,7 +249,7 @@ public:
}
};
-template<Vc::Implementation _impl>
+template<typename _impl = xsimd::current_arch>
class KoOptimizedCompositeOpAlphaDarkenCreamy32 :
public KoOptimizedCompositeOpAlphaDarken32Impl<_impl, KoAlphaDarkenParamsWrapperCreamy>
{
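
Equally mechanical are the mask queries: Vc's isFull()/isEmpty() become boolean reductions over xsimd's batch_bool, as the early-out tests above show. In stock xsimd terms, the patch's xsimd::none(m) reads as !xsimd::any(m). A short sketch:

    #include <xsimd/xsimd.hpp>

    using float_v = xsimd::batch<float>;

    inline bool allOpaque(float_v alpha)
    {
        return xsimd::all(alpha == float_v(255.0f)); // Vc: mask.isFull()
    }

    inline bool noEmptyLanes(float_v dstAlpha)
    {
        return !xsimd::any(dstAlpha == float_v(0.0f)); // Vc: mask.isEmpty()
    }
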
diff --git a/libs/pigment/compositeops/KoOptimizedCompositeOpCopy128.h b/libs/pigment/compositeops/KoOptimizedCompositeOpCopy128.h
index e89f43fe81..7d37591313 100644
--- a/libs/pigment/compositeops/KoOptimizedCompositeOpCopy128.h
+++ b/libs/pigment/compositeops/KoOptimizedCompositeOpCopy128.h
@@ -1,5 +1,6 @@
/*
* SPDX-FileCopyrightText: 2021 Dmitry Kazakov <dimula73 at gmail.com>
+ * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy at amyspark.me>
*
* SPDX-License-Identifier: LGPL-2.0-or-later
*/
@@ -31,54 +32,57 @@ struct CopyCompositor128 {
channels_type alpha;
};
- template<bool haveMask, bool src_aligned, Vc::Implementation _impl>
+ template<bool haveMask, bool src_aligned, typename _impl>
static ALWAYS_INLINE void compositeVector(const quint8 *src, quint8 *dst, const quint8 *mask, float opacity, const ParamsWrapper &oparams)
{
Q_UNUSED(oparams);
- Vc::float_v src_alpha;
- Vc::float_v src_c1;
- Vc::float_v src_c2;
- Vc::float_v src_c3;
+ using float_v = typename KoStreamedMath<_impl>::float_v;
+ using float_m = typename float_v::batch_bool_type;
+
+ float_v src_alpha;
+ float_v src_c1;
+ float_v src_c2;
+ float_v src_c3;
PixelWrapper<channels_type, _impl> dataWrapper;
dataWrapper.read(const_cast<quint8*>(src), src_c1, src_c2, src_c3, src_alpha);
- Vc::float_v opacity_norm_vec(opacity);
+ float_v opacity_norm_vec(opacity);
if (haveMask) {
- const Vc::float_v uint8MaxRec1(1.0f / 255.0f);
- Vc::float_v mask_vec = KoStreamedMath<_impl>::fetch_mask_8(mask);
+ const float_v uint8MaxRec1(1.0f / 255.0f);
+ float_v mask_vec = KoStreamedMath<_impl>::fetch_mask_8(mask);
opacity_norm_vec *= mask_vec * uint8MaxRec1;
}
- const Vc::float_v zeroValue(0.0f);
- const Vc::float_v oneValue(1.0f);
+ const float_v zeroValue(0.0f);
+ const float_v oneValue(1.0f);
- const Vc::float_m opacity_is_null_mask = opacity_norm_vec == zeroValue;
+ const float_m opacity_is_null_mask = opacity_norm_vec == zeroValue;
// The source cannot change the colors in the destination,
// since it's fully transparent
- if (opacity_is_null_mask.isFull()) {
+ if (xsimd::all(opacity_is_null_mask)) {
// noop
- } else if ((opacity_norm_vec == oneValue).isFull()) {
- if ((src_alpha == zeroValue).isFull()) {
+ } else if (xsimd::all(opacity_norm_vec == oneValue)) {
+ if (xsimd::all(src_alpha == zeroValue)) {
dataWrapper.clearPixels(dst);
} else {
dataWrapper.copyPixels(src, dst);
}
} else {
- Vc::float_v dst_alpha;
- Vc::float_v dst_c1;
- Vc::float_v dst_c2;
- Vc::float_v dst_c3;
+ float_v dst_alpha;
+ float_v dst_c1;
+ float_v dst_c2;
+ float_v dst_c3;
dataWrapper.read(dst, dst_c1, dst_c2, dst_c3, dst_alpha);
- Vc::float_v newAlpha = dst_alpha + opacity_norm_vec * (src_alpha - dst_alpha);
+ float_v newAlpha = dst_alpha + opacity_norm_vec * (src_alpha - dst_alpha);
- if ((newAlpha == zeroValue).isFull()) {
+ if (xsimd::all(newAlpha == zeroValue)) {
dataWrapper.clearPixels(dst);
} else {
PixelStateRecoverHelper<channels_type, _impl> pixelRecoverHelper(dst_c1, dst_c2, dst_c3);
@@ -95,12 +99,12 @@ struct CopyCompositor128 {
dst_c2 += opacity_norm_vec * (src_c2 - dst_c2);
dst_c3 += opacity_norm_vec * (src_c3 - dst_c3);
- if (!(newAlpha == oneValue).isFull()) {
+ if (!xsimd::all(newAlpha == oneValue)) {
/// This division by newAlpha may be unsafe in case
/// **some** elements of newAlpha are null. We don't
/// care, because:
///
- /// 1) the value will be clamped by Vc::min a bit later;
+ /// 1) the value will be clamped by xsimd::min a bit later;
///
/// 2) even if it doesn't, the new alpha will be null,
/// so the state of the color channels is undefined
@@ -109,11 +113,11 @@ struct CopyCompositor128 {
dst_c2 /= newAlpha;
dst_c3 /= newAlpha;
- Vc::float_v unitValue(KoColorSpaceMathsTraits<channels_type>::unitValue);
+ float_v unitValue(KoColorSpaceMathsTraits<channels_type>::unitValue);
- dst_c1 = Vc::min(dst_c1, unitValue);
- dst_c2 = Vc::min(dst_c2, unitValue);
- dst_c3 = Vc::min(dst_c3, unitValue);
+ dst_c1 = xsimd::min(dst_c1, unitValue);
+ dst_c2 = xsimd::min(dst_c2, unitValue);
+ dst_c3 = xsimd::min(dst_c3, unitValue);
}
/**
@@ -129,7 +133,7 @@ struct CopyCompositor128 {
}
}
- template <bool haveMask, Vc::Implementation _impl>
+ template <bool haveMask, typename _impl = xsimd::current_arch>
static ALWAYS_INLINE void compositeOnePixelScalar(const quint8 *src, quint8 *dst, const quint8 *mask, float opacity, const ParamsWrapper &oparams)
{
using namespace Arithmetic;
@@ -267,7 +271,7 @@ struct CopyCompositor128 {
* colorspaces with alpha channel placed at the last byte of
* the pixel: C1_C2_C3_A.
*/
-template<Vc::Implementation _impl>
+template<typename _impl>
class KoOptimizedCompositeOpCopy128 : public KoCompositeOp
{
public:
@@ -276,7 +280,7 @@ public:
using KoCompositeOp::composite;
- virtual void composite(const KoCompositeOp::ParameterInfo& params) const
+ void composite(const KoCompositeOp::ParameterInfo& params) const override
{
if(params.maskRowStart) {
composite<true>(params);
@@ -311,7 +315,7 @@ public:
}
};
-template<Vc::Implementation _impl>
+template<typename _impl>
class KoOptimizedCompositeOpCopyU64 : public KoCompositeOp
{
public:
@@ -320,7 +324,7 @@ public:
using KoCompositeOp::composite;
- virtual void composite(const KoCompositeOp::ParameterInfo& params) const
+ void composite(const KoCompositeOp::ParameterInfo& params) const override
{
if(params.maskRowStart) {
composite<true>(params);
@@ -356,7 +360,7 @@ public:
};
-template<Vc::Implementation _impl>
+template<typename _impl>
class KoOptimizedCompositeOpCopy32 : public KoCompositeOp
{
public:
@@ -365,7 +369,7 @@ public:
using KoCompositeOp::composite;
- virtual void composite(const KoCompositeOp::ParameterInfo& params) const
+ void composite(const KoCompositeOp::ParameterInfo& params) const override
{
if(params.maskRowStart) {
composite<true>(params);
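
The division hazard documented in the hunk above is worth isolating: dividing the interpolated channels by newAlpha can hit zero lanes, producing inf or NaN. The inf lanes are bounded by the xsimd::min clamp; a NaN lane carries zero alpha downstream, so its color value is never consumed, which is the same argument the in-code comment makes. A condensed sketch:

    #include <xsimd/xsimd.hpp>

    using float_v = xsimd::batch<float>;

    // Un-premultiply a channel and clamp to the colorspace unit value.
    // Zero lanes of newAlpha give inf (clamped here) or NaN (ignored,
    // since those lanes stay fully transparent).
    inline float_v unpremultiplyClamped(float_v channel, float_v newAlpha,
                                        float unitValue)
    {
        return xsimd::min(channel / newAlpha, float_v(unitValue));
    }
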
diff --git a/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.cpp b/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.cpp
index a748111889..147fd9f018 100644
--- a/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.cpp
+++ b/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.cpp
@@ -1,13 +1,10 @@
/*
* SPDX-FileCopyrightText: 2012 Dmitry Kazakov <dimula73 at gmail.com>
+ * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy at amyspark.me>
*
* SPDX-License-Identifier: LGPL-2.1-or-later
*/
-#if !defined _MSC_VER
-#pragma GCC diagnostic ignored "-Wundef"
-#endif
-
#include "KoOptimizedCompositeOpFactoryPerArch.h"
#include "KoOptimizedCompositeOpAlphaDarken32.h"
#include "KoOptimizedCompositeOpAlphaDarken128.h"
@@ -20,104 +17,100 @@
#include <KoCompositeOpRegistry.h>
-#if defined(__clang__)
-#pragma GCC diagnostic ignored "-Wlocal-type-template-args"
-#endif
-
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenHard32>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenHard32>::create<Vc::CurrentImplementation::current()>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenHard32>::create<xsimd::current_arch>(ParamType param)
{
- return new KoOptimizedCompositeOpAlphaDarkenHard32<Vc::CurrentImplementation::current()>(param);
+ return new KoOptimizedCompositeOpAlphaDarkenHard32<xsimd::current_arch>(param);
}
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenCreamy32>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenCreamy32>::create<Vc::CurrentImplementation::current()>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenCreamy32>::create<xsimd::current_arch>(ParamType param)
{
- return new KoOptimizedCompositeOpAlphaDarkenCreamy32<Vc::CurrentImplementation::current()>(param);
+ return new KoOptimizedCompositeOpAlphaDarkenCreamy32<xsimd::current_arch>(param);
}
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOver32>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOver32>::create<Vc::CurrentImplementation::current()>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOver32>::create<xsimd::current_arch>(ParamType param)
{
- return new KoOptimizedCompositeOpOver32<Vc::CurrentImplementation::current()>(param);
+ return new KoOptimizedCompositeOpOver32<xsimd::current_arch>(param);
}
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenHard128>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenHard128>::create<Vc::CurrentImplementation::current()>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenHard128>::create<xsimd::current_arch>(ParamType param)
{
- return new KoOptimizedCompositeOpAlphaDarkenHard128<Vc::CurrentImplementation::current()>(param);
+ return new KoOptimizedCompositeOpAlphaDarkenHard128<xsimd::current_arch>(param);
}
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenCreamy128>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenCreamy128>::create<Vc::CurrentImplementation::current()>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenCreamy128>::create<xsimd::current_arch>(ParamType param)
{
- return new KoOptimizedCompositeOpAlphaDarkenCreamy128<Vc::CurrentImplementation::current()>(param);
+ return new KoOptimizedCompositeOpAlphaDarkenCreamy128<xsimd::current_arch>(param);
}
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOver128>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOver128>::create<Vc::CurrentImplementation::current()>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOver128>::create<xsimd::current_arch>(ParamType param)
{
- return new KoOptimizedCompositeOpOver128<Vc::CurrentImplementation::current()>(param);
+ return new KoOptimizedCompositeOpOver128<xsimd::current_arch>(param);
}
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOverU64>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOverU64>::create<Vc::CurrentImplementation::current()>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOverU64>::create<xsimd::current_arch>(ParamType param)
{
- return new KoOptimizedCompositeOpOverU64<Vc::CurrentImplementation::current()>(param);
+ return new KoOptimizedCompositeOpOverU64<xsimd::current_arch>(param);
}
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpCopy128>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpCopy128>::create<Vc::CurrentImplementation::current()>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpCopy128>::create<xsimd::current_arch>(ParamType param)
{
- return new KoOptimizedCompositeOpCopy128<Vc::CurrentImplementation::current()>(param);
+ return new KoOptimizedCompositeOpCopy128<xsimd::current_arch>(param);
}
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpCopyU64>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpCopyU64>::create<Vc::CurrentImplementation::current()>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpCopyU64>::create<xsimd::current_arch>(ParamType param)
{
- return new KoOptimizedCompositeOpCopyU64<Vc::CurrentImplementation::current()>(param);
+ return new KoOptimizedCompositeOpCopyU64<xsimd::current_arch>(param);
}
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpCopy32>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpCopy32>::create<Vc::CurrentImplementation::current()>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpCopy32>::create<xsimd::current_arch>(ParamType param)
{
- return new KoOptimizedCompositeOpCopy32<Vc::CurrentImplementation::current()>(param);
+ return new KoOptimizedCompositeOpCopy32<xsimd::current_arch>(param);
}
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenHardU64>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenHardU64>::create<Vc::CurrentImplementation::current()>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenHardU64>::create<xsimd::current_arch>(ParamType param)
{
- return new KoOptimizedCompositeOpAlphaDarkenHardU64<Vc::CurrentImplementation::current()>(param);
+ return new KoOptimizedCompositeOpAlphaDarkenHardU64<xsimd::current_arch>(param);
}
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenCreamyU64>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenCreamyU64>::create<Vc::CurrentImplementation::current()>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenCreamyU64>::create<xsimd::current_arch>(ParamType param)
{
- return new KoOptimizedCompositeOpAlphaDarkenCreamyU64<Vc::CurrentImplementation::current()>(param);
+ return new KoOptimizedCompositeOpAlphaDarkenCreamyU64<xsimd::current_arch>(param);
}
diff --git a/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.h b/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.h
index ff36f47786..379eb89cfa 100644
--- a/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.h
+++ b/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch.h
@@ -1,5 +1,6 @@
/*
* SPDX-FileCopyrightText: 2012 Dmitry Kazakov <dimula73 at gmail.com>
+ * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy at amyspark.me>
*
* SPDX-License-Identifier: LGPL-2.1-or-later
*/
@@ -8,58 +9,56 @@
#define KOOPTIMIZEDCOMPOSITEOPFACTORYPERARCH_H
-#include <compositeops/KoVcMultiArchBuildSupport.h>
+#include <compositeops/KoMultiArchBuildSupport.h>
class KoCompositeOp;
class KoColorSpace;
-template<Vc::Implementation _impl>
+template<typename _impl>
class KoOptimizedCompositeOpAlphaDarkenCreamy32;
-template<Vc::Implementation _impl>
+template<typename _impl>
class KoOptimizedCompositeOpAlphaDarkenHard32;
-template<Vc::Implementation _impl>
+template<typename _impl>
class KoOptimizedCompositeOpOver32;
-template<Vc::Implementation _impl>
+template<typename _impl>
class KoOptimizedCompositeOpAlphaDarkenHard128;
-template<Vc::Implementation _impl>
+template<typename _impl>
class KoOptimizedCompositeOpAlphaDarkenCreamy128;
-template<Vc::Implementation _impl>
+template<typename _impl>
class KoOptimizedCompositeOpAlphaDarkenHardU64;
-template<Vc::Implementation _impl>
+template<typename _impl>
class KoOptimizedCompositeOpAlphaDarkenCreamyU64;
-template<Vc::Implementation _impl>
+template<typename _impl>
class KoOptimizedCompositeOpOver128;
-template<Vc::Implementation _impl>
+template<typename _impl>
class KoOptimizedCompositeOpOverU64;
-template<Vc::Implementation _impl>
+template<typename _impl>
class KoOptimizedCompositeOpCopy128;
-template<Vc::Implementation _impl>
+template<typename _impl>
class KoOptimizedCompositeOpCopyU64;
-template<Vc::Implementation _impl>
+template<typename _impl>
class KoOptimizedCompositeOpCopy32;
-template<template<Vc::Implementation I> class CompositeOp>
-struct KoOptimizedCompositeOpFactoryPerArch
-{
- typedef const KoColorSpace* ParamType;
- typedef KoCompositeOp* ReturnType;
+template<template<typename I> class CompositeOp>
+struct KoOptimizedCompositeOpFactoryPerArch {
+ using ParamType = const KoColorSpace *;
+ using ReturnType = KoCompositeOp *;
- template<Vc::Implementation _impl>
+ template <typename _impl>
static ReturnType create(ParamType param);
};
-
#endif /* KOOPTIMIZEDCOMPOSITEOPFACTORYPERARCH_H */
diff --git a/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch_Scalar.cpp b/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch_Scalar.cpp
index 7118af186b..4d3ae906b6 100644
--- a/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch_Scalar.cpp
+++ b/libs/pigment/compositeops/KoOptimizedCompositeOpFactoryPerArch_Scalar.cpp
@@ -1,5 +1,6 @@
/*
* SPDX-FileCopyrightText: 2012 Dmitry Kazakov <dimula73 at gmail.com>
+ * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy at amyspark.me>
*
* SPDX-License-Identifier: LGPL-2.1-or-later
*/
@@ -15,7 +16,7 @@
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenHard32>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenHard32>::create<Vc::ScalarImpl>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenHard32>::create<xsimd::generic>(ParamType param)
{
return new KoCompositeOpAlphaDarken<KoBgrU8Traits, KoAlphaDarkenParamsWrapperHard>(param);
}
@@ -23,7 +24,7 @@ KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenHard32>::c
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenCreamy32>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenCreamy32>::create<Vc::ScalarImpl>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenCreamy32>::create<xsimd::generic>(ParamType param)
{
return new KoCompositeOpAlphaDarken<KoBgrU8Traits, KoAlphaDarkenParamsWrapperCreamy>(param);
}
@@ -31,7 +32,7 @@ KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenCreamy32>:
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOver32>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOver32>::create<Vc::ScalarImpl>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOver32>::create<xsimd::generic>(ParamType param)
{
return new KoCompositeOpOver<KoBgrU8Traits>(param);
}
@@ -39,7 +40,7 @@ KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOver32>::create<Vc::S
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenHard128>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenHard128>::create<Vc::ScalarImpl>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenHard128>::create<xsimd::generic>(ParamType param)
{
return new KoCompositeOpAlphaDarken<KoRgbF32Traits, KoAlphaDarkenParamsWrapperHard>(param);
}
@@ -47,7 +48,7 @@ KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenHard128>::
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenCreamy128>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenCreamy128>::create<Vc::ScalarImpl>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenCreamy128>::create<xsimd::generic>(ParamType param)
{
return new KoCompositeOpAlphaDarken<KoRgbF32Traits, KoAlphaDarkenParamsWrapperCreamy>(param);
}
@@ -56,7 +57,7 @@ KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenCreamy128>
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOver128>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOver128>::create<Vc::ScalarImpl>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOver128>::create<xsimd::generic>(ParamType param)
{
return new KoCompositeOpOver<KoRgbF32Traits>(param);
}
@@ -64,7 +65,7 @@ KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOver128>::create<Vc::
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOverU64>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOverU64>::create<Vc::ScalarImpl>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOverU64>::create<xsimd::generic>(ParamType param)
{
return new KoCompositeOpOver<KoBgrU16Traits>(param);
}
@@ -72,7 +73,7 @@ KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpOverU64>::create<Vc::
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpCopy128>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpCopy128>::create<Vc::ScalarImpl>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpCopy128>::create<xsimd::generic>(ParamType param)
{
return new KoCompositeOpCopy2<KoRgbF32Traits>(param);
}
@@ -80,7 +81,7 @@ KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpCopy128>::create<Vc::
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpCopyU64>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpCopyU64>::create<Vc::ScalarImpl>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpCopyU64>::create<xsimd::generic>(ParamType param)
{
return new KoCompositeOpCopy2<KoBgrU16Traits>(param);
}
@@ -88,7 +89,7 @@ KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpCopyU64>::create<Vc::
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpCopy32>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpCopy32>::create<Vc::ScalarImpl>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpCopy32>::create<xsimd::generic>(ParamType param)
{
return new KoCompositeOpCopy2<KoBgrU8Traits>(param);
}
@@ -96,7 +97,7 @@ KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpCopy32>::create<Vc::S
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenHardU64>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenHardU64>::create<Vc::ScalarImpl>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenHardU64>::create<xsimd::generic>(ParamType param)
{
return new KoCompositeOpAlphaDarken<KoBgrU16Traits, KoAlphaDarkenParamsWrapperHard>(param);
}
@@ -104,7 +105,7 @@ KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenHardU64>::
template<>
template<>
KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenCreamyU64>::ReturnType
-KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenCreamyU64>::create<Vc::ScalarImpl>(ParamType param)
+KoOptimizedCompositeOpFactoryPerArch<KoOptimizedCompositeOpAlphaDarkenCreamyU64>::create<xsimd::generic>(ParamType param)
{
return new KoCompositeOpAlphaDarken<KoBgrU16Traits, KoAlphaDarkenParamsWrapperCreamy>(param);
}
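
This scalar translation unit is the counterpart of the per-arch one: instead of instantiating the optimized op for another ISA, it fully specializes create<> on the fallback arch and returns the portable, non-SIMD implementation. The mechanism reduced to illustrative names (a local tag stands in for the xsimd::generic extension arch):

    struct OpBase {
        virtual ~OpBase() = default;
    };

    struct ScalarOp : OpBase {}; // the portable, non-vectorized path

    struct GenericTag {}; // stands in for Krita's xsimd::generic

    struct Factory {
        template<typename Arch>
        static OpBase *create(int param); // defined per-arch elsewhere
    };

    // The scalar TU short-circuits to the plain implementation.
    template<>
    OpBase *Factory::create<GenericTag>(int /*param*/)
    {
        return new ScalarOp();
    }
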
diff --git a/libs/pigment/compositeops/KoOptimizedCompositeOpOver128.h b/libs/pigment/compositeops/KoOptimizedCompositeOpOver128.h
index 8c0d6c8a04..84575e4bc7 100644
--- a/libs/pigment/compositeops/KoOptimizedCompositeOpOver128.h
+++ b/libs/pigment/compositeops/KoOptimizedCompositeOpOver128.h
@@ -1,5 +1,6 @@
/*
* SPDX-FileCopyrightText: 2015 Thorsten Zachmann <zachmann at kde.org>
+ * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy at amyspark.me>
*
* SPDX-License-Identifier: LGPL-2.0-or-later
*/
@@ -34,7 +35,7 @@ struct OverCompositor128 {
};
// \see docs in AlphaDarkenCompositor32
- template<bool haveMask, bool src_aligned, Vc::Implementation _impl>
+ template<bool haveMask, bool src_aligned, typename _impl>
static ALWAYS_INLINE void compositeVector(const quint8 *src, quint8 *dst, const quint8 *mask, float opacity, const ParamsWrapper &oparams)
{
#if INFO_DEBUG
@@ -48,52 +49,55 @@ struct OverCompositor128 {
qInfo() << "count" << countOne << countTwo << countThree << countFour << countTotal << opacity;
}
#endif
+ using float_v = typename KoStreamedMath<_impl>::float_v;
+ using float_m = typename float_v::batch_bool_type;
+
Q_UNUSED(oparams);
- Vc::float_v src_alpha;
- Vc::float_v dst_alpha;
+ float_v src_alpha;
+ float_v dst_alpha;
- Vc::float_v src_c1;
- Vc::float_v src_c2;
- Vc::float_v src_c3;
+ float_v src_c1;
+ float_v src_c2;
+ float_v src_c3;
PixelWrapper<channels_type, _impl> dataWrapper;
dataWrapper.read(const_cast<quint8*>(src), src_c1, src_c2, src_c3, src_alpha);
//bool haveOpacity = opacity != 1.0;
- const Vc::float_v opacity_norm_vec(opacity);
+ const float_v opacity_norm_vec(opacity);
src_alpha *= opacity_norm_vec;
if (haveMask) {
- const Vc::float_v uint8MaxRec1((float)1.0 / 255);
- Vc::float_v mask_vec = KoStreamedMath<_impl>::fetch_mask_8(mask);
+ const float_v uint8MaxRec1((float)1.0 / 255);
+ float_v mask_vec = KoStreamedMath<_impl>::fetch_mask_8(mask);
src_alpha *= mask_vec * uint8MaxRec1;
}
- const Vc::float_v zeroValue(static_cast<float>(NATIVE_OPACITY_TRANSPARENT));
+ const float_v zeroValue(static_cast<float>(NATIVE_OPACITY_TRANSPARENT));
// The source cannot change the colors in the destination,
// since it's fully transparent
- if ((src_alpha == zeroValue).isFull()) {
+ if (xsimd::all(src_alpha == zeroValue)) {
#if INFO_DEBUG
countFour++;
#endif
return;
}
- Vc::float_v dst_c1;
- Vc::float_v dst_c2;
- Vc::float_v dst_c3;
+ float_v dst_c1;
+ float_v dst_c2;
+ float_v dst_c3;
dataWrapper.read(dst, dst_c1, dst_c2, dst_c3, dst_alpha);
- Vc::float_v src_blend;
- Vc::float_v new_alpha;
+ float_v src_blend;
+ float_v new_alpha;
- const Vc::float_v oneValue(1.0f);
- if ((dst_alpha == oneValue).isFull()) {
+ const float_v oneValue(1.0f);
+ if (xsimd::all(dst_alpha == oneValue)) {
new_alpha = dst_alpha;
src_blend = src_alpha;
- } else if ((dst_alpha == zeroValue).isFull()) {
+ } else if (xsimd::all(dst_alpha == zeroValue)) {
new_alpha = src_alpha;
src_blend = oneValue;
} else {
@@ -102,12 +106,12 @@ struct OverCompositor128 {
* which will result in NaN values while division.
*/
new_alpha = dst_alpha + (oneValue - dst_alpha) * src_alpha;
- Vc::float_m mask = (new_alpha == zeroValue);
+ float_m mask = (new_alpha == zeroValue);
src_blend = src_alpha / new_alpha;
- src_blend.setZero(mask);
+ src_blend = xsimd::set_zero(src_blend, mask);
}
- if (!(src_blend == oneValue).isFull()) {
+ if (!xsimd::all(src_blend == oneValue)) {
#if INFO_DEBUG
++countOne;
#endif
@@ -125,8 +129,12 @@ struct OverCompositor128 {
}
}
- template <bool haveMask, Vc::Implementation _impl>
- static ALWAYS_INLINE void compositeOnePixelScalar(const quint8 *src, quint8 *dst, const quint8 *mask, float opacity, const ParamsWrapper &oparams)
+ template<bool haveMask, typename _impl>
+ static ALWAYS_INLINE void compositeOnePixelScalar(const quint8 *src,
+ quint8 *dst,
+ const quint8 *mask,
+ float opacity,
+ const ParamsWrapper &oparams)
{
using namespace Arithmetic;
const qint32 alpha_pos = 3;
@@ -229,7 +237,7 @@ struct OverCompositor128 {
* colorspaces with alpha channel placed at the last byte of
* the pixel: C1_C2_C3_A.
*/
-template<Vc::Implementation _impl>
+template<typename _impl>
class KoOptimizedCompositeOpOver128 : public KoCompositeOp
{
public:
@@ -238,7 +246,7 @@ public:
using KoCompositeOp::composite;
- virtual void composite(const KoCompositeOp::ParameterInfo& params) const
+ void composite(const KoCompositeOp::ParameterInfo& params) const override
{
if(params.maskRowStart) {
composite<true>(params);
@@ -273,7 +281,7 @@ public:
}
};
-template<Vc::Implementation _impl>
+template<typename _impl>
class KoOptimizedCompositeOpOverU64 : public KoCompositeOp
{
public:
@@ -282,7 +290,7 @@ public:
using KoCompositeOp::composite;
- virtual void composite(const KoCompositeOp::ParameterInfo& params) const
+ void composite(const KoCompositeOp::ParameterInfo& params) const override
{
if(params.maskRowStart) {
composite<true>(params);
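
One helper above deserves a flag: src_blend.setZero(mask) from Vc becomes xsimd::set_zero(src_blend, mask), which is not stock xsimd; it appears to come from the compatibility helpers this commit adds under xsimd_extensions. With plain xsimd the same effect is a select against zero:

    #include <xsimd/xsimd.hpp>

    using float_v = xsimd::batch<float>;
    using float_m = float_v::batch_bool_type;

    // Zero out the lanes selected by m, keep the rest of v.
    inline float_v zeroWhere(float_v v, float_m m)
    {
        return xsimd::select(m, float_v(0.0f), v);
    }
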
diff --git a/libs/pigment/compositeops/KoOptimizedCompositeOpOver32.h b/libs/pigment/compositeops/KoOptimizedCompositeOpOver32.h
index bc38b3f8b9..5a243353d0 100644
--- a/libs/pigment/compositeops/KoOptimizedCompositeOpOver32.h
+++ b/libs/pigment/compositeops/KoOptimizedCompositeOpOver32.h
@@ -1,6 +1,7 @@
/*
* SPDX-FileCopyrightText: 2006 Cyrille Berger <cberger at cberger.net>
* SPDX-FileCopyrightText: 2011 Silvio Heinrich <plassy at web.de>
+ * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy at amyspark.me>
*
* SPDX-License-Identifier: LGPL-2.0-or-later
*/
@@ -13,38 +14,6 @@
#include "KoStreamedMath.h"
-template<Vc::Implementation _impl>
-struct OptiDiv {
- static ALWAYS_INLINE float divScalar(const float& divident, const float& divisor) {
-#ifdef __SSE__
- float result;
-
- __m128 x = _mm_set_ss(divisor);
- __m128 y = _mm_set_ss(divident);
- x = _mm_rcp_ss(x);
- x = _mm_mul_ss(x, y);
-
-
- _mm_store_ss(&result, x);
- return result;
-#else
- return divident / divisor;
-#endif
-
- }
-
- static ALWAYS_INLINE Vc::float_v divVector(Vc::float_v::AsArg divident, Vc::float_v::AsArg divisor) {
-#ifdef __SSE__
- return divident * Vc::reciprocal(divisor);
-#else
- return divident / divisor;
-#endif
-
- }
-
-};
-
-
template<typename channels_type, typename pixel_type, bool alphaLocked, bool allChannelsFlag>
struct OverCompositor32 {
struct ParamsWrapper {
@@ -56,56 +25,58 @@ struct OverCompositor32 {
};
// \see docs in AlphaDarkenCompositor32
- template<bool haveMask, bool src_aligned, Vc::Implementation _impl>
+ template<bool haveMask, bool src_aligned, typename _impl>
static ALWAYS_INLINE void compositeVector(const quint8 *src, quint8 *dst, const quint8 *mask, float opacity, const ParamsWrapper &oparams)
{
Q_UNUSED(oparams);
- Vc::float_v src_alpha;
- Vc::float_v dst_alpha;
+ using float_v = typename KoStreamedMath<_impl>::float_v;
+
+ float_v src_alpha;
+ float_v dst_alpha;
src_alpha = KoStreamedMath<_impl>::template fetch_alpha_32<src_aligned>(src);
bool haveOpacity = opacity != 1.0f;
- Vc::float_v opacity_norm_vec(opacity);
+ float_v opacity_norm_vec(opacity);
- Vc::float_v uint8Max(255.0f);
- Vc::float_v uint8MaxRec1(1.0f / 255.0f);
- Vc::float_v zeroValue(Vc::Zero);
- Vc::float_v oneValue(Vc::One);
+ float_v uint8Max(255.0f);
+ float_v uint8MaxRec1(1.0f / 255.0f);
+ float_v zeroValue(0);
+ float_v oneValue(1);
src_alpha *= opacity_norm_vec;
if (haveMask) {
- Vc::float_v mask_vec = KoStreamedMath<_impl>::fetch_mask_8(mask);
+ float_v mask_vec = KoStreamedMath<_impl>::fetch_mask_8(mask);
src_alpha *= mask_vec * uint8MaxRec1;
}
// The source cannot change the colors in the destination,
// since it's fully transparent
- if ((src_alpha == zeroValue).isFull()) {
+ if (xsimd::all(src_alpha == zeroValue)) {
return;
}
dst_alpha = KoStreamedMath<_impl>::template fetch_alpha_32<true>(dst);
- Vc::float_v src_c1;
- Vc::float_v src_c2;
- Vc::float_v src_c3;
+ float_v src_c1;
+ float_v src_c2;
+ float_v src_c3;
- Vc::float_v dst_c1;
- Vc::float_v dst_c2;
- Vc::float_v dst_c3;
+ float_v dst_c1;
+ float_v dst_c2;
+ float_v dst_c3;
KoStreamedMath<_impl>::template fetch_colors_32<src_aligned>(src, src_c1, src_c2, src_c3);
- Vc::float_v src_blend;
- Vc::float_v new_alpha;
+ float_v src_blend;
+ float_v new_alpha;
- if ((dst_alpha == uint8Max).isFull()) {
+ if (xsimd::all(dst_alpha == uint8Max)) {
new_alpha = dst_alpha;
src_blend = src_alpha * uint8MaxRec1;
- } else if ((dst_alpha == zeroValue).isFull()) {
+ } else if (xsimd::all(dst_alpha == zeroValue)) {
new_alpha = src_alpha;
src_blend = oneValue;
} else {
@@ -123,7 +94,7 @@ struct OverCompositor32 {
}
- if (!(src_blend == oneValue).isFull()) {
+ if (!xsimd::all(src_blend == oneValue)) {
KoStreamedMath<_impl>::template fetch_colors_32<true>(dst, dst_c1, dst_c2, dst_c3);
dst_c1 = src_blend * (src_c1 - dst_c1) + dst_c1;
@@ -132,7 +103,7 @@ struct OverCompositor32 {
} else {
if (!haveMask && !haveOpacity) {
- memcpy(dst, src, 4 * Vc::float_v::size());
+ memcpy(dst, src, 4 * float_v::size);
return;
} else {
// opacity has changed the alpha of the source,
@@ -146,7 +117,7 @@ struct OverCompositor32 {
KoStreamedMath<_impl>::write_channels_32(dst, new_alpha, dst_c1, dst_c2, dst_c3);
}
- template <bool haveMask, Vc::Implementation _impl>
+ template <bool haveMask, typename _impl>
static ALWAYS_INLINE void compositeOnePixelScalar(const channels_type *src, channels_type *dst, const quint8 *mask, float opacity, const ParamsWrapper &oparams)
{
using namespace Arithmetic;
@@ -227,7 +198,7 @@ struct OverCompositor32 {
* colorspaces with alpha channel placed at the last byte of
* the pixel: C1_C2_C3_A.
*/
-template<Vc::Implementation _impl>
+template<typename _impl>
class KoOptimizedCompositeOpOver32 : public KoCompositeOp
{
public:
@@ -236,7 +207,7 @@ public:
using KoCompositeOp::composite;
- virtual void composite(const KoCompositeOp::ParameterInfo& params) const
+ void composite(const KoCompositeOp::ParameterInfo& params) const override
{
if(params.maskRowStart) {
composite<true>(params);
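
For reference, the substitutions recurring through these two files follow a small dictionary. A sketch of the correspondence, assuming a current xsimd; note that set_zero comes from the xsimd_extensions added by this commit, not from upstream xsimd:

    #include <xsimd/xsimd.hpp>

    // Vc idiom                      xsimd idiom used in this port
    // Vc::float_v                -> xsimd::batch<float, Arch>
    // Vc::float_m                -> xsimd::batch_bool<float, Arch>
    // (a == b).isFull()          -> xsimd::all(a == b)
    // !(m).isEmpty()             -> xsimd::any(m)
    // v.setZero(m)               -> v = xsimd::set_zero(v, m)  // extension helper
    // Vc::float_v::size()        -> float_v::size              // now a constexpr member
    // v(m) = w  (masked write)   -> v = xsimd::select(m, w, v)

    // Tiny worked instance of the masked-write replacement:
    template<typename Arch>
    xsimd::batch<float, Arch> clampNegativeToZero(xsimd::batch<float, Arch> v)
    {
        const xsimd::batch<float, Arch> zero(0.0f);
        return xsimd::select(v < zero, zero, v); // Vc: v.setZero(v < zero)
    }
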
diff --git a/libs/pigment/compositeops/KoStreamedMath.h b/libs/pigment/compositeops/KoStreamedMath.h
index 797930191e..59491c99d4 100644
--- a/libs/pigment/compositeops/KoStreamedMath.h
+++ b/libs/pigment/compositeops/KoStreamedMath.h
@@ -1,6 +1,7 @@
/*
* SPDX-FileCopyrightText: 2012 Dmitry Kazakov <dimula73 at gmail.com>
* SPDX-FileCopyrightText: 2020 Mathias Wein <lynx.mw+kde at gmail.com>
+ * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy at amyspark.me>
*
* SPDX-License-Identifier: LGPL-2.1-or-later
*/
@@ -8,39 +9,21 @@
#ifndef __KOSTREAMED_MATH_H
#define __KOSTREAMED_MATH_H
-#if defined _MSC_VER
-// Lets shut up the "possible loss of data" and "forcing value to bool 'true' or 'false'
-#pragma warning ( push )
-#pragma warning ( disable : 4146 ) // avx/detail.h
-#pragma warning ( disable : 4244 )
-#pragma warning ( disable : 4267 ) // interleavedmemory
-#pragma warning ( disable : 4800 )
-#endif
-#include <Vc/Vc>
-#include <Vc/IO>
-#include <immintrin.h>
-#if defined _MSC_VER
-#pragma warning ( pop )
-#endif
-
#include <cstdint>
#include <iostream>
#include <type_traits>
+#include <xsimd_extensions/xsimd.hpp>
+
#include <KoAlwaysInline.h>
-#include <KoCompositeOp.h>
#include <KoColorSpaceMaths.h>
+#include <KoCompositeOp.h>
#define BLOCKDEBUG 0
-#if !defined _MSC_VER
-#pragma GCC diagnostic ignored "-Wcast-align"
-#endif
-
-
-template<Vc::Implementation _impl>
+template<typename _impl>
struct OptiRound {
- ALWAYS_INLINE
- static float roundScalar(const float& value) {
+ ALWAYS_INLINE static float roundScalar(const float &value)
+ {
#ifdef __SSE4_1__
// SSE/AVX instructions use round-to-even rounding rule so we
// should reuse it when possible
@@ -56,425 +39,412 @@ struct OptiRound {
#else
return value + 0.5f;
#endif
-
}
};
-template<Vc::Implementation _impl>
-struct KoStreamedMath {
-
-using int_v = Vc::SimdArray<int, Vc::float_v::size()>;
-using uint_v = Vc::SimdArray<unsigned int, Vc::float_v::size()>;
+template<typename _impl>
+struct OptiDiv {
+ using float_v = xsimd::batch<float, _impl>;
+ ALWAYS_INLINE static float divScalar(const float &divident, const float &divisor)
+ {
+#ifdef __SSE__
+ float result = NAN;
-/**
- * Composes src into dst without using vector instructions
- */
-template<bool useMask, bool useFlow, class Compositor, int pixelSize>
- static void genericComposite_novector(const KoCompositeOp::ParameterInfo& params)
-{
- using namespace Arithmetic;
+ __m128 x = _mm_set_ss(divisor);
+ __m128 y = _mm_set_ss(divident);
+ x = _mm_rcp_ss(x);
+ x = _mm_mul_ss(x, y);
- const qint32 linearInc = pixelSize;
- qint32 srcLinearInc = params.srcRowStride ? pixelSize : 0;
+ _mm_store_ss(&result, x);
+ return result;
+#elif defined __ARM_NEON
+ auto x = vdupq_n_f32(divisor);
+ auto y = vdupq_n_f32(divident);
+ x = vrecpeq_f32(x);
+ x = vmulq_f32(x, y);
- quint8* dstRowStart = params.dstRowStart;
- const quint8* maskRowStart = params.maskRowStart;
- const quint8* srcRowStart = params.srcRowStart;
- typename Compositor::ParamsWrapper paramsWrapper(params);
+ return vgetq_lane_f32(x, 0);
+#else
+ return (1.f / divisor) * divident;
+#endif
+ }
- for(qint32 r = params.rows; r > 0; --r) {
- const quint8 *mask = maskRowStart;
- const quint8 *src = srcRowStart;
- quint8 *dst = dstRowStart;
+ ALWAYS_INLINE static float_v divVector(const float_v &divident, const float_v &divisor)
+ {
+ return divident * xsimd::reciprocal(divisor);
+ }
+};
- int blockRest = params.cols;
+template<typename _impl>
+struct KoStreamedMath {
+ using int_v = xsimd::batch<int, _impl>;
+ using uint_v = xsimd::batch<unsigned int, _impl>;
+ using float_v = xsimd::batch<float, _impl>;
- for(int i = 0; i < blockRest; i++) {
- Compositor::template compositeOnePixelScalar<useMask, _impl>(src, dst, mask, params.opacity, paramsWrapper);
- src += srcLinearInc;
- dst += linearInc;
+ static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
+ static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");
- if (useMask) {
- mask++;
+ /**
+ * Composes src into dst without using vector instructions
+ */
+ template<bool useMask, bool useFlow, class Compositor, int pixelSize>
+ static void genericComposite_novector(const KoCompositeOp::ParameterInfo &params)
+ {
+ using namespace Arithmetic;
+
+ const qint32 linearInc = pixelSize;
+ qint32 srcLinearInc = params.srcRowStride ? pixelSize : 0;
+
+ quint8 *dstRowStart = params.dstRowStart;
+ const quint8 *maskRowStart = params.maskRowStart;
+ const quint8 *srcRowStart = params.srcRowStart;
+ typename Compositor::ParamsWrapper paramsWrapper(params);
+
+ for (qint32 r = params.rows; r > 0; --r) {
+ const quint8 *mask = maskRowStart;
+ const quint8 *src = srcRowStart;
+ quint8 *dst = dstRowStart;
+
+ int blockRest = params.cols;
+
+ for (int i = 0; i < blockRest; i++) {
+ Compositor::template compositeOnePixelScalar<useMask, _impl>(src,
+ dst,
+ mask,
+ params.opacity,
+ paramsWrapper);
+ src += srcLinearInc;
+ dst += linearInc;
+
+ if (useMask) {
+ mask++;
+ }
}
- }
- srcRowStart += params.srcRowStride;
- dstRowStart += params.dstRowStride;
+ srcRowStart += params.srcRowStride;
+ dstRowStart += params.dstRowStride;
- if (useMask) {
- maskRowStart += params.maskRowStride;
+ if (useMask) {
+ maskRowStart += params.maskRowStride;
+ }
}
}
-}
-template<bool useMask, bool useFlow, class Compositor>
- static void genericComposite32_novector(const KoCompositeOp::ParameterInfo& params)
-{
- genericComposite_novector<useMask, useFlow, Compositor, 4>(params);
-}
+ template<bool useMask, bool useFlow, class Compositor>
+ static void genericComposite32_novector(const KoCompositeOp::ParameterInfo &params)
+ {
+ genericComposite_novector<useMask, useFlow, Compositor, 4>(params);
+ }
-template<bool useMask, bool useFlow, class Compositor>
- static void genericComposite128_novector(const KoCompositeOp::ParameterInfo& params)
-{
- genericComposite_novector<useMask, useFlow, Compositor, 16>(params);
-}
+ template<bool useMask, bool useFlow, class Compositor>
+ static void genericComposite128_novector(const KoCompositeOp::ParameterInfo &params)
+ {
+ genericComposite_novector<useMask, useFlow, Compositor, 16>(params);
+ }
-template<bool useMask, bool useFlow, class Compositor>
- static void genericComposite64_novector(const KoCompositeOp::ParameterInfo& params)
-{
- genericComposite_novector<useMask, useFlow, Compositor, 8>(params);
-}
+ template<bool useMask, bool useFlow, class Compositor>
+ static void genericComposite64_novector(const KoCompositeOp::ParameterInfo &params)
+ {
+ genericComposite_novector<useMask, useFlow, Compositor, 8>(params);
+ }
-static inline quint8 round_float_to_u8(float x) {
- return OptiRound<_impl>::roundScalar(x);
-}
+ static inline quint8 round_float_to_u8(float x)
+ {
+ return OptiRound<_impl>::roundScalar(x);
+ }
-static inline quint8 lerp_mixed_u8_float(quint8 a, quint8 b, float alpha) {
- return round_float_to_u8(qint16(b - a) * alpha + a);
-}
+ static inline quint8 lerp_mixed_u8_float(quint8 a, quint8 b, float alpha)
+ {
+ return round_float_to_u8(float(b - a) * alpha + float(a));
+ }
-/**
- * Round a vector of floats to the next corresponding integers.
- */
-static inline int_v iRound(Vc::float_v::AsArg a)
-{
-#if defined(Vc_IMPL_AVX2)
- return Vc::simd_cast<int_v>(Vc::int_v(_mm256_cvtps_epi32(a.data())));
-#elif defined(Vc_IMPL_AVX)
/**
- * WARNING: Vc, on AVX, supplies 256-bit floating point vectors but stays
- * in SSE land for integers. It is not possible to cast between Vc::int_v
- * and uint_v without a custom SIMD type, because otherwise we lose the
- * XMM1 register. By using such a type:
- * using avx_int_v = Vc::Vector<Vc::int_v::EntryType, Vc::float_v::abi>;
- * static_assert(int_v::size() == avx_int_v::size(),
- * "uint_v must match the AVX placeholder");
- * and copying the entries manually, a smart compiler can do a single move
- * to memory (Clang 12):
- * mov rax, rdi
- * vcvtps2dq ymm0, ymm0
- * vmovapd ymmword ptr [rdi], ymm0
- * vzeroupper // useless since it's already stored in [rdi]
- * ret
- * GCC 5.5 and 7.3, as well as MSVC, do not optimize such manual copying;
- * but by handling the internal registers themselves (as done below),
- * we achieve the following while still preserving Clang 12's optimization:
- * mov rax, rdi
- * vcvtps2dq ymm0, ymm0
- * vextractf128 XMMWORD PTR [rdi+16], ymm0, 0x1
- * vmovaps XMMWORD PTR [rdi], xmm0 // vmovdqu with MSVC
- * vzeroupper // same as above
- * ret
+ * Get a vector containing the first float_v::size values of mask.
+ * Each source mask element is considered to be an 8-bit integer
*/
+ static inline float_v fetch_mask_8(const quint8 *data)
+ {
+ return xsimd::batch_cast<float>(xsimd::load_and_extend<int_v>(data));
+ }
- __m256i temp(_mm256_cvtps_epi32(a.data()));
- int_v res;
+ /**
+ * Get alpha values from float_v::size pixels 32-bit each
+ * (4 channels, 8 bit per channel). The alpha value is considered
+ * to be stored in the most significant byte of the pixel
+ *
+ * \p aligned controls whether the \p data is fetched using aligned
+ * instruction or not.
+ * 1) Fetching aligned data with unaligned instruction
+ * degrades performance.
+ * 2) Fetching unaligned data with aligned instruction
+ * causes \#GP (General Protection Exception)
+ */
+ template<bool aligned>
+ static inline float_v fetch_alpha_32(const void *data)
+ {
+ using U = typename std::conditional<aligned, xsimd::aligned_mode, xsimd::unaligned_mode>::type;
+ const auto data_i = uint_v::load(static_cast<const typename uint_v::value_type *>(data), U{});
+ return xsimd::to_float(xsimd::bitwise_cast<int_v>(data_i >> 24));
+ }
- internal_data(internal_data1(res)) = Vc_1::AVX::hi128(temp);
- internal_data(internal_data0(res)) = Vc_1::AVX::lo128(temp);
+ /**
+ * Get color values from float_v::size pixels 32-bit each
+ * (4 channels, 8 bit per channel). The color data is considered
+ * to be stored in the 3 least significant bytes of the pixel.
+ *
+ * \p aligned controls whether the \p data is fetched using aligned
+ * instruction or not.
+ * 1) Fetching aligned data with unaligned instruction
+ * degrades performance.
+ * 2) Fetching unaligned data with aligned instruction
+ * causes \#GP (General Protection Exception)
+ */
+ template<bool aligned>
+ static inline void fetch_colors_32(const void *data, float_v &c1, float_v &c2, float_v &c3)
+ {
+ using U = typename std::conditional<aligned, xsimd::aligned_mode, xsimd::unaligned_mode>::type;
- return res;
-#elif defined(Vc_IMPL_SSE2)
- return Vc::simd_cast<int_v>(Vc::int_v(_mm_cvtps_epi32(a.data())));
-#else
- return Vc::simd_cast<int_v>(Vc::round(a));
-#endif
-}
+ const auto data_i = uint_v::load(static_cast<const typename uint_v::value_type *>(data), U{});
-/**
- * Get a vector containing first Vc::float_v::size() values of mask.
- * Each source mask element is considered to be a 8-bit integer
- */
-static inline Vc::float_v fetch_mask_8(const quint8 *data) {
- uint_v data_i(data);
- return Vc::simd_cast<Vc::float_v>(int_v(data_i));
-}
+ const uint_v mask(0xFF);
-/**
- * Get an alpha values from Vc::float_v::size() pixels 32-bit each
- * (4 channels, 8 bit per channel). The alpha value is considered
- * to be stored in the most significant byte of the pixel
- *
- * \p aligned controls whether the \p data is fetched using aligned
- * instruction or not.
- * 1) Fetching aligned data with unaligned instruction
- * degrades performance.
- * 2) Fetching unaligned data with aligned instruction
- * causes \#GP (General Protection Exception)
- */
-template <bool aligned>
-static inline Vc::float_v fetch_alpha_32(const void *data) {
- uint_v data_i;
- if (aligned) {
- data_i.load(static_cast<const quint32*>(data), Vc::Aligned);
- } else {
- data_i.load(static_cast<const quint32 *>(data), Vc::Unaligned);
+ c1 = xsimd::to_float(xsimd::bitwise_cast<int_v>((data_i >> 16) & mask));
+ c2 = xsimd::to_float(xsimd::bitwise_cast<int_v>((data_i >> 8) & mask));
+ c3 = xsimd::to_float(xsimd::bitwise_cast<int_v>((data_i) & mask));
}
- return Vc::simd_cast<Vc::float_v>(int_v(data_i >> 24));
-}
+ /**
+ * Pack color and alpha values to float_v::size pixels 32-bit each
+ * (4 channels, 8 bit per channel). The color data is considered
+ * to be stored in the 3 least significant bytes of the pixel, alpha -
+ * in the most significant byte
+ *
+ * NOTE: \p data must be aligned pointer!
+ */
+ static inline void
+ write_channels_32(void *data, const float_v alpha, const float_v c1, const float_v c2, const float_v c3)
+ {
+ const int_v mask(0xFF);
-/**
- * Get color values from Vc::float_v::size() pixels 32-bit each
- * (4 channels, 8 bit per channel). The color data is considered
- * to be stored in the 3 least significant bytes of the pixel.
- *
- * \p aligned controls whether the \p data is fetched using aligned
- * instruction or not.
- * 1) Fetching aligned data with unaligned instruction
- * degrades performance.
- * 2) Fetching unaligned data with aligned instruction
- * causes \#GP (General Protection Exception)
- */
-template <bool aligned>
-static inline void fetch_colors_32(const void *data,
- Vc::float_v &c1,
- Vc::float_v &c2,
- Vc::float_v &c3) {
- int_v data_i;
- if (aligned) {
- data_i.load(static_cast<const quint32*>(data), Vc::Aligned);
- } else {
- data_i.load(static_cast<const quint32*>(data), Vc::Unaligned);
- }
-
- const quint32 lowByteMask = 0xFF;
- uint_v mask(lowByteMask);
-
- c1 = Vc::simd_cast<Vc::float_v>(int_v((data_i >> 16) & mask));
- c2 = Vc::simd_cast<Vc::float_v>(int_v((data_i >> 8) & mask));
- c3 = Vc::simd_cast<Vc::float_v>(int_v( data_i & mask));
-}
+ const auto v1 = (xsimd::nearbyint_as_int(alpha)) << 24;
+ const auto v2 = (xsimd::nearbyint_as_int(c1) & mask) << 16;
+ const auto v3 = (xsimd::nearbyint_as_int(c2) & mask) << 8;
+ const auto v4 = (xsimd::nearbyint_as_int(c3) & mask);
+ xsimd::store_aligned(static_cast<typename int_v::value_type *>(data), (v1 | v2) | (v3 | v4));
+ }
-/**
- * Pack color and alpha values to Vc::float_v::size() pixels 32-bit each
- * (4 channels, 8 bit per channel). The color data is considered
- * to be stored in the 3 least significant bytes of the pixel, alpha -
- * in the most significant byte
- *
- * NOTE: \p data must be aligned pointer!
- */
-static inline void write_channels_32(void *data,
- Vc::float_v::AsArg alpha,
- Vc::float_v::AsArg c1,
- Vc::float_v::AsArg c2,
- Vc::float_v::AsArg c3) {
- const quint32 lowByteMask = 0xFF;
-
- uint_v mask(lowByteMask);
- uint_v v1 = uint_v(iRound(alpha)) << 24;
- uint_v v2 = (uint_v(iRound(c1)) & mask) << 16;
- uint_v v3 = (uint_v(iRound(c2)) & mask) << 8;
- uint_v v4 = uint_v(iRound(c3)) & mask;
- v1 = v1 | v2;
- v3 = v3 | v4;
- (v1 | v3).store(static_cast<quint32*>(data), Vc::Aligned);
-}
+ static inline void
+ write_channels_32_unaligned(void *data, const float_v alpha, const float_v c1, const float_v c2, const float_v c3)
+ {
+ const int_v mask(0xFF);
-static inline void write_channels_32_unaligned(void *data,
- Vc::float_v::AsArg alpha,
- Vc::float_v::AsArg c1,
- Vc::float_v::AsArg c2,
- Vc::float_v::AsArg c3) {
- const quint32 lowByteMask = 0xFF;
-
- uint_v mask(lowByteMask);
- uint_v v1 = uint_v(iRound(alpha)) << 24;
- uint_v v2 = (uint_v(iRound(c1)) & mask) << 16;
- uint_v v3 = (uint_v(iRound(c2)) & mask) << 8;
- uint_v v4 = uint_v(iRound(c3)) & mask;
- v1 = v1 | v2;
- v3 = v3 | v4;
- (v1 | v3).store(static_cast<quint32*>(data), Vc::Unaligned);
-}
+ const auto v1 = (xsimd::nearbyint_as_int(alpha)) << 24;
+ const auto v2 = (xsimd::nearbyint_as_int(c1) & mask) << 16;
+ const auto v3 = (xsimd::nearbyint_as_int(c2) & mask) << 8;
+ const auto v4 = (xsimd::nearbyint_as_int(c3) & mask);
+ xsimd::store_unaligned(static_cast<typename int_v::value_type *>(data), (v1 | v2) | (v3 | v4));
+ }
-/**
- * Composes src pixels into dst pixles. Is optimized for 32-bit-per-pixel
- * colorspaces. Uses \p Compositor strategy parameter for doing actual
- * math of the composition
- */
-template<bool useMask, bool useFlow, class Compositor, int pixelSize>
- static void genericComposite(const KoCompositeOp::ParameterInfo& params)
-{
- using namespace Arithmetic;
-
- const int vectorSize = static_cast<int>(Vc::float_v::size());
- const qint32 vectorInc = pixelSize * vectorSize;
- const qint32 linearInc = pixelSize;
- qint32 srcVectorInc = vectorInc;
- qint32 srcLinearInc = pixelSize;
-
- quint8 *dstRowStart = params.dstRowStart;
- const quint8 *maskRowStart = params.maskRowStart;
- const quint8* srcRowStart = params.srcRowStart;
- typename Compositor::ParamsWrapper paramsWrapper(params);
-
- if (!params.srcRowStride) {
- if (pixelSize == 4) {
- KoStreamedMath::uint_v *buf = reinterpret_cast<KoStreamedMath::uint_v*>(Vc::malloc<quint32, Vc::AlignOnVector>(vectorSize));
- *buf = uint_v(*(reinterpret_cast<const quint32 *>(srcRowStart)));
- srcRowStart = reinterpret_cast<quint8*>(buf);
- srcLinearInc = 0;
- srcVectorInc = 0;
- } else {
- quint8 *buf = Vc::malloc<quint8, Vc::AlignOnVector>(vectorInc);
- quint8 *ptr = buf;
-
- for (size_t i = 0; i < vectorSize; i++) {
- memcpy(ptr, params.srcRowStart, pixelSize);
- ptr += pixelSize;
+ /**
+ * Composes src pixels into dst pixels. It is optimized for 32-bit-per-pixel
+ * colorspaces. Uses \p Compositor strategy parameter for doing actual
+ * math of the composition
+ */
+ template<bool useMask, bool useFlow, class Compositor, int pixelSize>
+ static void genericComposite(const KoCompositeOp::ParameterInfo &params)
+ {
+ using namespace Arithmetic;
+
+ const int vectorSize = static_cast<int>(float_v::size);
+ const qint32 vectorInc = pixelSize * vectorSize;
+ const qint32 linearInc = pixelSize;
+ qint32 srcVectorInc = vectorInc;
+ qint32 srcLinearInc = pixelSize;
+
+ quint8 *dstRowStart = params.dstRowStart;
+ const quint8 *maskRowStart = params.maskRowStart;
+ const quint8 *srcRowStart = params.srcRowStart;
+ typename Compositor::ParamsWrapper paramsWrapper(params);
+
+ if (!params.srcRowStride) {
+ if (pixelSize == 4) {
+ auto *buf = reinterpret_cast<uint_v *>(xsimd::vector_aligned_malloc<typename uint_v::value_type>(vectorSize));
+ *buf = uint_v(*(reinterpret_cast<const quint32 *>(srcRowStart)));
+ srcRowStart = reinterpret_cast<quint8 *>(buf);
+ srcLinearInc = 0;
+ srcVectorInc = 0;
+ } else {
+ auto *buf = xsimd::vector_aligned_malloc<quint8>(vectorInc);
+ quint8 *ptr = buf;
+
+ for (size_t i = 0; i < vectorSize; i++) {
+ memcpy(ptr, params.srcRowStart, pixelSize);
+ ptr += pixelSize;
+ }
+
+ srcRowStart = buf;
+ srcLinearInc = 0;
+ srcVectorInc = 0;
}
-
- srcRowStart = buf;
- srcLinearInc = 0;
- srcVectorInc = 0;
}
- }
#if BLOCKDEBUG
- int totalBlockAlign = 0;
- int totalBlockAlignedVector = 0;
- int totalBlockUnalignedVector = 0;
- int totalBlockRest = 0;
+ int totalBlockAlign = 0;
+ int totalBlockAlignedVector = 0;
+ int totalBlockUnalignedVector = 0;
+ int totalBlockRest = 0;
#endif
- for (qint32 r = params.rows; r > 0; --r) {
- // Hint: Mask is allowed to be unaligned
- const quint8 *mask = maskRowStart;
-
- const quint8 *src = srcRowStart;
- quint8 *dst = dstRowStart;
-
- const int pixelsAlignmentMask = vectorSize * sizeof(float) - 1;
- uintptr_t srcPtrValue = reinterpret_cast<uintptr_t>(src);
- uintptr_t dstPtrValue = reinterpret_cast<uintptr_t>(dst);
- uintptr_t srcAlignment = srcPtrValue & pixelsAlignmentMask;
- uintptr_t dstAlignment = dstPtrValue & pixelsAlignmentMask;
-
- // Uncomment if facing problems with alignment:
- // Q_ASSERT_X(!(dstAlignment & 3), "Compositioning",
- // "Pixel data must be aligned on pixels borders!");
-
- int blockAlign = params.cols;
- int blockAlignedVector = 0;
- int blockUnalignedVector = 0;
- int blockRest = 0;
-
- int *vectorBlock =
- srcAlignment == dstAlignment || !srcVectorInc ?
- &blockAlignedVector : &blockUnalignedVector;
-
- if (!dstAlignment) {
- blockAlign = 0;
- *vectorBlock = params.cols / vectorSize;
- blockRest = params.cols % vectorSize;
- } else if (params.cols > 2 * vectorSize) {
- blockAlign = (vectorInc - dstAlignment) / pixelSize;
- const int restCols = params.cols - blockAlign;
- if (restCols > 0) {
- *vectorBlock = restCols / vectorSize;
- blockRest = restCols % vectorSize;
+ for (qint32 r = params.rows; r > 0; --r) {
+ // Hint: Mask is allowed to be unaligned
+ const quint8 *mask = maskRowStart;
+
+ const quint8 *src = srcRowStart;
+ quint8 *dst = dstRowStart;
+
+ const int pixelsAlignmentMask = vectorSize * sizeof(float) - 1;
+ auto srcPtrValue = reinterpret_cast<uintptr_t>(src);
+ auto dstPtrValue = reinterpret_cast<uintptr_t>(dst);
+ uintptr_t srcAlignment = srcPtrValue & pixelsAlignmentMask;
+ uintptr_t dstAlignment = dstPtrValue & pixelsAlignmentMask;
+
+ // Uncomment if facing problems with alignment:
+ // Q_ASSERT_X(!(dstAlignment & 3), "Compositioning",
+ // "Pixel data must be aligned on pixels borders!");
+
+ int blockAlign = params.cols;
+ int blockAlignedVector = 0;
+ int blockUnalignedVector = 0;
+ int blockRest = 0;
+
+ int *vectorBlock =
+ srcAlignment == dstAlignment || !srcVectorInc ? &blockAlignedVector : &blockUnalignedVector;
+
+ if (!dstAlignment) {
+ blockAlign = 0;
+ *vectorBlock = params.cols / vectorSize;
+ blockRest = params.cols % vectorSize;
+ } else if (params.cols > 2 * vectorSize) {
+ blockAlign = (vectorInc - dstAlignment) / pixelSize;
+ const int restCols = params.cols - blockAlign;
+ if (restCols > 0) {
+ *vectorBlock = restCols / vectorSize;
+ blockRest = restCols % vectorSize;
+ } else {
+ blockAlign = params.cols;
+ *vectorBlock = 0;
+ blockRest = 0;
+ }
}
- else {
- blockAlign = params.cols;
- *vectorBlock = 0;
- blockRest = 0;
- }
- }
#if BLOCKDEBUG
- totalBlockAlign += blockAlign;
- totalBlockAlignedVector += blockAlignedVector;
- totalBlockUnalignedVector += blockUnalignedVector;
- totalBlockRest += blockRest;
+ totalBlockAlign += blockAlign;
+ totalBlockAlignedVector += blockAlignedVector;
+ totalBlockUnalignedVector += blockUnalignedVector;
+ totalBlockRest += blockRest;
#endif
- for(int i = 0; i < blockAlign; i++) {
- Compositor::template compositeOnePixelScalar<useMask, _impl>(src, dst, mask, params.opacity, paramsWrapper);
- src += srcLinearInc;
- dst += linearInc;
-
- if(useMask) {
- mask++;
+ for (int i = 0; i < blockAlign; i++) {
+ Compositor::template compositeOnePixelScalar<useMask, _impl>(src,
+ dst,
+ mask,
+ params.opacity,
+ paramsWrapper);
+ src += srcLinearInc;
+ dst += linearInc;
+
+ if (useMask) {
+ mask++;
+ }
}
- }
-
- for (int i = 0; i < blockAlignedVector; i++) {
- Compositor::template compositeVector<useMask, true, _impl>(src, dst, mask, params.opacity, paramsWrapper);
- src += srcVectorInc;
- dst += vectorInc;
- if (useMask) {
- mask += vectorSize;
+ for (int i = 0; i < blockAlignedVector; i++) {
+ Compositor::template compositeVector<useMask, true, _impl>(src,
+ dst,
+ mask,
+ params.opacity,
+ paramsWrapper);
+ src += srcVectorInc;
+ dst += vectorInc;
+
+ if (useMask) {
+ mask += vectorSize;
+ }
}
- }
-
- for (int i = 0; i < blockUnalignedVector; i++) {
- Compositor::template compositeVector<useMask, false, _impl>(src, dst, mask, params.opacity, paramsWrapper);
- src += srcVectorInc;
- dst += vectorInc;
- if (useMask) {
- mask += vectorSize;
+ for (int i = 0; i < blockUnalignedVector; i++) {
+ Compositor::template compositeVector<useMask, false, _impl>(src,
+ dst,
+ mask,
+ params.opacity,
+ paramsWrapper);
+ src += srcVectorInc;
+ dst += vectorInc;
+
+ if (useMask) {
+ mask += vectorSize;
+ }
}
- }
+ for (int i = 0; i < blockRest; i++) {
+ Compositor::template compositeOnePixelScalar<useMask, _impl>(src,
+ dst,
+ mask,
+ params.opacity,
+ paramsWrapper);
+ src += srcLinearInc;
+ dst += linearInc;
+
+ if (useMask) {
+ mask++;
+ }
+ }
- for(int i = 0; i < blockRest; i++) {
- Compositor::template compositeOnePixelScalar<useMask, _impl>(src, dst, mask, params.opacity, paramsWrapper);
- src += srcLinearInc;
- dst += linearInc;
+ srcRowStart += params.srcRowStride;
+ dstRowStart += params.dstRowStride;
if (useMask) {
- mask++;
+ maskRowStart += params.maskRowStride;
}
}
- srcRowStart += params.srcRowStride;
- dstRowStart += params.dstRowStride;
-
- if (useMask) {
- maskRowStart += params.maskRowStride;
- }
- }
-
#if BLOCKDEBUG
- dbgPigment << "I" << "rows:" << params.rows
- << "\tpad(S):" << totalBlockAlign
- << "\tbav(V):" << totalBlockAlignedVector
- << "\tbuv(V):" << totalBlockUnalignedVector
- << "\tres(S)" << totalBlockRest; // << srcAlignment << dstAlignment;
+ dbgPigment << "I"
+ << "rows:" << params.rows << "\tpad(S):" << totalBlockAlign << "\tbav(V):" << totalBlockAlignedVector
+ << "\tbuv(V):" << totalBlockUnalignedVector << "\tres(S)"
+ << totalBlockRest; // << srcAlignment << dstAlignment;
#endif
- if (!params.srcRowStride) {
- Vc::free<float>(reinterpret_cast<float*>(const_cast<quint8*>(srcRowStart)));
+ if (!params.srcRowStride) {
+ xsimd::vector_aligned_free(srcRowStart);
+ }
}
-}
-
-template<bool useMask, bool useFlow, class Compositor>
- static void genericComposite32(const KoCompositeOp::ParameterInfo& params)
-{
- genericComposite<useMask, useFlow, Compositor, 4>(params);
-}
-template<bool useMask, bool useFlow, class Compositor>
- static void genericComposite128(const KoCompositeOp::ParameterInfo& params)
-{
- genericComposite<useMask, useFlow, Compositor, 16>(params);
-}
+ template<bool useMask, bool useFlow, class Compositor>
+ static void genericComposite32(const KoCompositeOp::ParameterInfo &params)
+ {
+ genericComposite<useMask, useFlow, Compositor, 4>(params);
+ }
-template<bool useMask, bool useFlow, class Compositor>
- static void genericComposite64(const KoCompositeOp::ParameterInfo& params)
-{
- genericComposite<useMask, useFlow, Compositor, 8>(params);
-}
+ template<bool useMask, bool useFlow, class Compositor>
+ static void genericComposite128(const KoCompositeOp::ParameterInfo &params)
+ {
+ genericComposite<useMask, useFlow, Compositor, 16>(params);
+ }
+ template<bool useMask, bool useFlow, class Compositor>
+ static void genericComposite64(const KoCompositeOp::ParameterInfo &params)
+ {
+ genericComposite<useMask, useFlow, Compositor, 8>(params);
+ }
};
-template<typename channels_type, Vc::Implementation _impl>
+template<typename channels_type, class _impl>
struct PixelStateRecoverHelper {
+ using float_v = xsimd::batch<float, _impl>;
+ using float_m = typename float_v::batch_bool_type;
+
ALWAYS_INLINE
- PixelStateRecoverHelper(const Vc::float_v &c1, const Vc::float_v &c2, const Vc::float_v &c3)
+ PixelStateRecoverHelper(const float_v &c1, const float_v &c2, const float_v &c3)
{
Q_UNUSED(c1);
Q_UNUSED(c2);
@@ -482,7 +452,7 @@ struct PixelStateRecoverHelper {
}
ALWAYS_INLINE
- void recoverPixels(const Vc::float_m &mask, Vc::float_v &c1, Vc::float_v &c2, Vc::float_v &c3) const {
+ void recoverPixels(const float_m &mask, float_v &c1, float_v &c2, float_v &c3) const {
Q_UNUSED(mask);
Q_UNUSED(c1);
Q_UNUSED(c2);
@@ -490,10 +460,13 @@ struct PixelStateRecoverHelper {
}
};
-template<Vc::Implementation _impl>
+template<class _impl>
struct PixelStateRecoverHelper<float, _impl> {
+ using float_v = xsimd::batch<float, _impl>;
+ using float_m = typename float_v::batch_bool_type;
+
ALWAYS_INLINE
- PixelStateRecoverHelper(const Vc::float_v &c1, const Vc::float_v &c2, const Vc::float_v &c3)
+ PixelStateRecoverHelper(const float_v &c1, const float_v &c2, const float_v &c3)
: m_orig_c1(c1),
m_orig_c2(c2),
m_orig_c3(c3)
@@ -501,30 +474,33 @@ struct PixelStateRecoverHelper<float, _impl> {
}
ALWAYS_INLINE
- void recoverPixels(const Vc::float_m &mask, Vc::float_v &c1, Vc::float_v &c2, Vc::float_v &c3) const {
- if (!mask.isEmpty()) {
- c1(mask) = m_orig_c1;
- c2(mask) = m_orig_c2;
- c3(mask) = m_orig_c3;
+ void recoverPixels(const float_m &mask, float_v &c1, float_v &c2, float_v &c3) const {
+ if (xsimd::any(mask)) {
+ c1 = xsimd::select(mask, m_orig_c1, c1);
+ c2 = xsimd::select(mask, m_orig_c2, c2);
+ c3 = xsimd::select(mask, m_orig_c3, c3);
}
}
private:
- const Vc::float_v m_orig_c1;
- const Vc::float_v m_orig_c2;
- const Vc::float_v m_orig_c3;
+ const float_v m_orig_c1;
+ const float_v m_orig_c2;
+ const float_v m_orig_c3;
};
-template<typename channels_type, Vc::Implementation _impl>
+template<typename channels_type, class _impl>
struct PixelWrapper
{
};
-template<Vc::Implementation _impl>
-struct PixelWrapper<quint16, _impl>
-{
- using int_v = Vc::SimdArray<int, Vc::float_v::size()>;
- using uint_v = Vc::SimdArray<unsigned int, Vc::float_v::size()>;
+template<class _impl>
+struct PixelWrapper<quint16, _impl> {
+ using int_v = xsimd::batch<int, _impl>;
+ using uint_v = xsimd::batch<unsigned int, _impl>;
+ using float_v = xsimd::batch<float, _impl>;
+
+ static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
+ static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");
ALWAYS_INLINE
static quint16 lerpMixedUintFloat(quint16 a, quint16 b, float alpha)
@@ -533,154 +509,179 @@ struct PixelWrapper<quint16, _impl>
}
ALWAYS_INLINE
- static quint16 roundFloatToUint(float x) {
+ static quint16 roundFloatToUint(float x)
+ {
return OptiRound<_impl>::roundScalar(x);
}
ALWAYS_INLINE
- static void normalizeAlpha(float &alpha) {
+ static void normalizeAlpha(float &alpha)
+ {
const float uint16Rec1 = 1.0f / 65535.0f;
alpha *= uint16Rec1;
}
ALWAYS_INLINE
- static void denormalizeAlpha(float &alpha) {
+ static void denormalizeAlpha(float &alpha)
+ {
const float uint16Max = 65535.0f;
alpha *= uint16Max;
}
PixelWrapper()
- : mask(quint32(0xFFFF)),
- uint16Max(65535.0f),
- uint16Rec1(1.0f / 65535.0f)
+ : mask(0xFFFF)
+ , uint16Max(65535.0f)
+ , uint16Rec1(1.0f / 65535.0f)
{
}
- ALWAYS_INLINE
- void read(quint8 *dataDst, Vc::float_v &dst_c1, Vc::float_v &dst_c2, Vc::float_v &dst_c3, Vc::float_v &dst_alpha)
+ // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
+ ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
{
- struct PackedPixel {
- float rrgg;
- float bbaa;
- };
-
- Vc::InterleavedMemoryWrapper<PackedPixel, Vc::float_v> dataWrapper((PackedPixel*)(dataDst));
- Vc::float_v v1, v2;
- Vc::tie(v1, v2) = dataWrapper[size_t(0)];
- uint_v pixelsC1C2 = uint_v(Vc::reinterpret_components_cast<int_v>(v1));
- uint_v pixelsC3Alpha = uint_v(Vc::reinterpret_components_cast<int_v>(v2));
-
- dst_c1 = Vc::simd_cast<Vc::float_v>(pixelsC1C2 & mask);
- dst_c2 = Vc::simd_cast<Vc::float_v>((pixelsC1C2 >> 16) & mask);
- dst_c3 = Vc::simd_cast<Vc::float_v>(pixelsC3Alpha & mask);
- dst_alpha = Vc::simd_cast<Vc::float_v>((pixelsC3Alpha >> 16) & mask);
+ const auto *srcPtr = static_cast<const typename uint_v::value_type*>(src);
+ // struct PackedPixel {
+ // float rrgg;
+ // float bbaa;
+ // }
+ const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 2; // stride == 2
+ const auto idx2 = idx1 + 1; // offset 1 == 2nd members
+
+ const auto pixelsC1C2 = uint_v::gather(srcPtr, idx1);
+ const auto pixelsC3Alpha = uint_v::gather(srcPtr, idx2);
+
+ dst_c1 = xsimd::to_float(xsimd::bitwise_cast<int_v>(pixelsC1C2 & mask)); // r
+ dst_c2 = xsimd::to_float(xsimd::bitwise_cast<int_v>((pixelsC1C2 >> 16) & mask)); // g
+ dst_c3 = xsimd::to_float(xsimd::bitwise_cast<int_v>((pixelsC3Alpha & mask))); // b
+ dst_alpha = xsimd::to_float(xsimd::bitwise_cast<int_v>((pixelsC3Alpha >> 16) & mask)); // a
dst_alpha *= uint16Rec1;
}
- ALWAYS_INLINE
- void write(quint8 *dataDst, Vc::float_v::AsArg c1, Vc::float_v::AsArg c2, Vc::float_v::AsArg c3, Vc::float_v &alpha)
+ ALWAYS_INLINE void
+ // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
+ write(void *dst, const float_v &c1, const float_v &c2, const float_v &c3, const float_v &a)
{
- alpha *= uint16Max;
+ auto dstPtr = reinterpret_cast<typename int_v::value_type *>(dst);
+
+ const auto alpha = a * uint16Max;
- uint_v v1 = uint_v(int_v(Vc::round(c1)));
- uint_v v2 = uint_v(int_v(Vc::round(c2)));
- uint_v v3 = uint_v(int_v(Vc::round(c3)));
- uint_v v4 = uint_v(int_v(Vc::round(alpha)));
- uint_v c1c2 = ((v2 & mask) << 16) | (v1 & mask);
- uint_v c3ca = ((v4 & mask) << 16) | (v3 & mask);
- std::pair<int_v, int_v> out = Vc::interleave(c1c2, c3ca);
- out.first.store(reinterpret_cast<Vc::uint32_t*>(dataDst), Vc::Aligned);
- out.second.store(reinterpret_cast<Vc::uint32_t*>(dataDst) + out.first.size(), Vc::Aligned);
+ const auto v1 = xsimd::bitwise_cast<uint_v>(xsimd::nearbyint_as_int(c1));
+ const auto v2 = xsimd::bitwise_cast<uint_v>(xsimd::nearbyint_as_int(c2));
+ const auto v3 = xsimd::bitwise_cast<uint_v>(xsimd::nearbyint_as_int(c3));
+ const auto v4 = xsimd::bitwise_cast<uint_v>(xsimd::nearbyint_as_int(alpha));
+
+ const auto c1c2 = ((v2 & mask) << 16) | (v1 & mask);
+ const auto c3ca = ((v4 & mask) << 16) | (v3 & mask);
+
+ const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 2;
+ const auto idx2 = idx1 + 1;
+
+ c1c2.scatter(dstPtr, idx1);
+ c3ca.scatter(dstPtr, idx2);
}
ALWAYS_INLINE
- void clearPixels(quint8 *dataDst) {
- memset(dataDst, 0, Vc::float_v::size() * sizeof(quint16) * 4);
+ void clearPixels(quint8 *dataDst)
+ {
+ memset(dataDst, 0, float_v::size * sizeof(quint16) * 4);
}
ALWAYS_INLINE
- void copyPixels(const quint8 *dataSrc, quint8 *dataDst) {
- memcpy(dataDst, dataSrc, Vc::float_v::size() * sizeof(quint16) * 4);
+ void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
+ {
+ memcpy(dataDst, dataSrc, float_v::size * sizeof(quint16) * 4);
}
-
const uint_v mask;
- const Vc::float_v uint16Max;
- const Vc::float_v uint16Rec1;
+ const float_v uint16Max;
+ const float_v uint16Rec1;
};
-template<Vc::Implementation _impl>
-struct PixelWrapper<quint8, _impl>
-{
- using int_v = Vc::SimdArray<int, Vc::float_v::size()>;
- using uint_v = Vc::SimdArray<unsigned int, Vc::float_v::size()>;
+template<typename _impl>
+struct PixelWrapper<quint8, _impl> {
+ using int_v = xsimd::batch<int, _impl>;
+ using uint_v = xsimd::batch<unsigned int, _impl>;
+ using float_v = xsimd::batch<float, _impl>;
+
+ static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
+ static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");
ALWAYS_INLINE
- static quint8 lerpMixedUintFloat(quint8 a, quint8 b, float alpha) {
+ static quint8 lerpMixedUintFloat(quint8 a, quint8 b, float alpha)
+ {
return KoStreamedMath<_impl>::lerp_mixed_u8_float(a, b, alpha);
}
ALWAYS_INLINE
- static quint8 roundFloatToUint(float x) {
+ static quint8 roundFloatToUint(float x)
+ {
return KoStreamedMath<_impl>::round_float_to_u8(x);
}
ALWAYS_INLINE
- static void normalizeAlpha(float &alpha) {
+ static void normalizeAlpha(float &alpha)
+ {
const float uint8Rec1 = 1.0f / 255.0f;
alpha *= uint8Rec1;
}
ALWAYS_INLINE
- static void denormalizeAlpha(float &alpha) {
+ static void denormalizeAlpha(float &alpha)
+ {
const float uint8Max = 255.0f;
alpha *= uint8Max;
}
PixelWrapper()
- : mask(quint32(0xFF)),
- uint8Max(255.0f),
- uint8Rec1(1.0f / 255.0f)
+ : mask(quint32(0xFF))
+ , uint8Max(255.0f)
+ , uint8Rec1(1.0f / 255.0f)
{
}
- ALWAYS_INLINE
- void read(quint8 *dataDst, Vc::float_v &dst_c1, Vc::float_v &dst_c2, Vc::float_v &dst_c3, Vc::float_v &dst_alpha)
+ // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
+ ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
{
- dst_alpha = KoStreamedMath<_impl>::template fetch_alpha_32<false>(dataDst);
- KoStreamedMath<_impl>::template fetch_colors_32<false>(dataDst, dst_c1, dst_c2, dst_c3);
+ dst_alpha = KoStreamedMath<_impl>::template fetch_alpha_32<false>(src);
+ KoStreamedMath<_impl>::template fetch_colors_32<false>(src, dst_c1, dst_c2, dst_c3);
dst_alpha *= uint8Rec1;
}
ALWAYS_INLINE
- void write(quint8 *dataDst, Vc::float_v::AsArg c1, Vc::float_v::AsArg c2, Vc::float_v::AsArg c3, Vc::float_v &alpha)
+ void write(quint8 *dataDst, const float_v &c1, const float_v &c2, const float_v &c3, const float_v &a)
{
- alpha *= uint8Max;
+ const auto alpha = a * uint8Max;
KoStreamedMath<_impl>::write_channels_32_unaligned(dataDst, alpha, c1, c2, c3);
}
ALWAYS_INLINE
- void clearPixels(quint8 *dataDst) {
- memset(dataDst, 0, Vc::float_v::size() * sizeof(quint8) * 4);
+ void clearPixels(quint8 *dataDst)
+ {
+ memset(dataDst, 0, float_v::size * sizeof(quint8) * 4);
}
ALWAYS_INLINE
- void copyPixels(const quint8 *dataSrc, quint8 *dataDst) {
- memcpy(dataDst, dataSrc, Vc::float_v::size() * sizeof(quint8) * 4);
+ void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
+ {
+ memcpy(dataDst, dataSrc, float_v::size * sizeof(quint8) * 4);
}
-
const uint_v mask;
- const Vc::float_v uint8Max;
- const Vc::float_v uint8Rec1;
+ const float_v uint8Max;
+ const float_v uint8Rec1;
};
-template<Vc::Implementation _impl>
-struct PixelWrapper<float, _impl>
-{
+template<typename _impl>
+struct PixelWrapper<float, _impl> {
+ using int_v = xsimd::batch<int, _impl>;
+ using uint_v = xsimd::batch<unsigned int, _impl>;
+ using float_v = xsimd::batch<float, _impl>;
+
+ static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
+ static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");
+
struct Pixel {
float red;
float green;
@@ -689,112 +690,89 @@ struct PixelWrapper<float, _impl>
};
ALWAYS_INLINE
- static float lerpMixedUintFloat(float a, float b, float alpha) {
+ static float lerpMixedUintFloat(float a, float b, float alpha)
+ {
return Arithmetic::lerp(a,b,alpha);
}
ALWAYS_INLINE
- static float roundFloatToUint(float x) {
+ static float roundFloatToUint(float x)
+ {
return x;
}
ALWAYS_INLINE
- static void normalizeAlpha(float &alpha) {
+ static void normalizeAlpha(float &alpha)
+ {
Q_UNUSED(alpha);
}
ALWAYS_INLINE
- static void denormalizeAlpha(float &alpha) {
+ static void denormalizeAlpha(float &alpha)
+ {
Q_UNUSED(alpha);
}
- PixelWrapper()
- : indexes(Vc::IndexesFromZero)
- {
- }
+ PixelWrapper() = default;
- ALWAYS_INLINE
- void read(quint8 *dstPtr, Vc::float_v &dst_c1, Vc::float_v &dst_c2, Vc::float_v &dst_c3, Vc::float_v &dst_alpha)
+ // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
+ ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
{
- Vc::InterleavedMemoryWrapper<Pixel, Vc::float_v> dataDst(reinterpret_cast<Pixel*>(dstPtr));
- tie(dst_c1, dst_c2, dst_c3, dst_alpha) = dataDst[indexes];
+ const auto srcPtr = reinterpret_cast<const typename float_v::value_type *>(src);
+ const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 4; // stride == 4
+ const auto idx2 = idx1 + 1;
+ const auto idx3 = idx1 + 2;
+ const auto idx4 = idx1 + 3;
+ dst_c1 = float_v::gather(srcPtr, idx1);
+ dst_c2 = float_v::gather(srcPtr, idx2);
+ dst_c3 = float_v::gather(srcPtr, idx3);
+ dst_alpha = float_v::gather(srcPtr, idx4);
}
- ALWAYS_INLINE
- void write(quint8 *dstPtr, Vc::float_v &dst_c1, Vc::float_v &dst_c2, Vc::float_v &dst_c3, Vc::float_v &dst_alpha)
+ ALWAYS_INLINE void
+ // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
+ write(void *dst, const float_v &src_c1, const float_v &src_c2, const float_v &src_c3, const float_v &src_alpha)
{
- Vc::InterleavedMemoryWrapper<Pixel, Vc::float_v> dataDst(reinterpret_cast<Pixel*>(dstPtr));
- dataDst[indexes] = tie(dst_c1, dst_c2, dst_c3, dst_alpha);
+ auto dstPtr = reinterpret_cast<typename float_v::value_type *>(dst);
+
+ const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 4; // stride == 4
+ const auto idx2 = idx1 + 1;
+ const auto idx3 = idx1 + 2;
+ const auto idx4 = idx1 + 3;
+
+ src_c1.scatter(dstPtr, idx1);
+ src_c2.scatter(dstPtr, idx2);
+ src_c3.scatter(dstPtr, idx3);
+ src_alpha.scatter(dstPtr, idx4);
}
ALWAYS_INLINE
- void clearPixels(quint8 *dataDst) {
- memset(dataDst, 0, Vc::float_v::size() * sizeof(float) * 4);
+ void clearPixels(quint8 *dataDst)
+ {
+ memset(dataDst, 0, float_v::size * sizeof(float) * 4);
}
ALWAYS_INLINE
- void copyPixels(const quint8 *dataSrc, quint8 *dataDst) {
- memcpy(dataDst, dataSrc, Vc::float_v::size() * sizeof(float) * 4);
+ void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
+ {
+ memcpy(dataDst, dataSrc, float_v::size * sizeof(float) * 4);
}
-
- const Vc::float_v::IndexType indexes;
};
-namespace KoStreamedMathFunctions {
-
-template<int pixelSize>
-ALWAYS_INLINE void clearPixel(quint8* dst);
-
-template<>
-ALWAYS_INLINE void clearPixel<4>(quint8* dst)
-{
- quint32 *d = reinterpret_cast<quint32*>(dst);
- *d = 0;
-}
-
-template<>
-ALWAYS_INLINE void clearPixel<8>(quint8* dst)
+namespace KoStreamedMathFunctions
{
- quint64 *d = reinterpret_cast<quint64*>(dst);
- d[0] = 0;
-}
-
-template<>
-ALWAYS_INLINE void clearPixel<16>(quint8* dst)
-{
- quint64 *d = reinterpret_cast<quint64*>(dst);
- d[0] = 0;
- d[1] = 0;
-}
-
template<int pixelSize>
-ALWAYS_INLINE void copyPixel(const quint8 *src, quint8* dst);
-
-template<>
-ALWAYS_INLINE void copyPixel<4>(const quint8 *src, quint8* dst)
+ALWAYS_INLINE void clearPixel(quint8 *dst)
{
- const quint32 *s = reinterpret_cast<const quint32*>(src);
- quint32 *d = reinterpret_cast<quint32*>(dst);
- *d = *s;
+ std::memset(dst, 0, pixelSize);
}
-template<>
-ALWAYS_INLINE void copyPixel<8>(const quint8 *src, quint8* dst)
-{
- const quint64 *s = reinterpret_cast<const quint64*>(src);
- quint64 *d = reinterpret_cast<quint64*>(dst);
- d[0] = s[0];
-}
-
-template<>
-ALWAYS_INLINE void copyPixel<16>(const quint8 *src, quint8* dst)
+template<int pixelSize>
+ALWAYS_INLINE void copyPixel(const quint8 *src, quint8 *dst)
{
- const quint64 *s = reinterpret_cast<const quint64*>(src);
- quint64 *d = reinterpret_cast<quint64*>(dst);
- d[0] = s[0];
- d[1] = s[1];
-}
+ std::memcpy(dst, src, pixelSize);
}
+} // namespace KoStreamedMathFunctions
#endif /* __KOSTREAMED_MATH_H */
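
The interleaved pixel loads in PixelWrapper above replace Vc::InterleavedMemoryWrapper with explicit gathers over an index batch. Here is a standalone sketch of the same stride-4 deinterleave for float RGBA, assuming the gather entry points visible in this patch; xsimd::detail::make_sequence_as_batch is the internal helper the patch itself relies on, and the function name readRgbaPlanar is ours:

    #include <xsimd/xsimd.hpp>

    template<typename Arch>
    void readRgbaPlanar(const float *src,
                        xsimd::batch<float, Arch> &r,
                        xsimd::batch<float, Arch> &g,
                        xsimd::batch<float, Arch> &b,
                        xsimd::batch<float, Arch> &a)
    {
        using int_v = xsimd::batch<int, Arch>;
        using float_v = xsimd::batch<float, Arch>;
        // 0, 4, 8, ... pick the first channel of each RGBA pixel;
        // +1 / +2 / +3 shift to the remaining channels (stride == 4 floats).
        const int_v idx = xsimd::detail::make_sequence_as_batch<int_v>() * 4;
        r = float_v::gather(src, idx);
        g = float_v::gather(src, idx + 1);
        b = float_v::gather(src, idx + 2);
        a = float_v::gather(src, idx + 3);
    }

Writes go the same way through scatter(), as in PixelWrapper<float>::write above.
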
diff --git a/libs/pigment/compositeops/KoVcMultiArchBuildSupport.h b/libs/pigment/compositeops/KoVcMultiArchBuildSupport.h
deleted file mode 100644
index 59765085f9..0000000000
--- a/libs/pigment/compositeops/KoVcMultiArchBuildSupport.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * SPDX-FileCopyrightText: 2012 Dmitry Kazakov <dimula73 at gmail.com>
- *
- * SPDX-License-Identifier: LGPL-2.1-or-later
- */
-
-#ifndef __KOVCMULTIARCHBUILDSUPPORT_H
-#define __KOVCMULTIARCHBUILDSUPPORT_H
-
-#include "config-vc.h"
-
-#ifdef HAVE_VC
-
-#if defined(__clang__)
-
-#pragma GCC diagnostic ignored "-Wlocal-type-template-args"
-#endif
-
-#if defined _MSC_VER
-// Lets shut up the "possible loss of data" and "forcing value to bool 'true' or 'false'
-#pragma warning ( push )
-#pragma warning ( disable : 4146 ) // avx/detail.h
-#pragma warning ( disable : 4244 )
-#pragma warning ( disable : 4267 ) // interleavedmemory
-#pragma warning ( disable : 4800 )
-#endif
-#include <Vc/global.h>
-#include <Vc/Vc>
-#include <Vc/support.h>
-#if defined _MSC_VER
-#pragma warning ( pop )
-#endif
-
-#else /* HAVE_VC */
-
-namespace Vc {
- enum Implementation /*: std::uint_least32_t*/ {
- ScalarImpl,
- };
- class CurrentImplementation {
- public:
- static constexpr Implementation current()
- {
- return static_cast<Implementation>(ScalarImpl);
- }
- };
-}
-
-
-#endif /* HAVE_VC */
-
-
-#include <QDebug>
-#include <ksharedconfig.h>
-#include <kconfig.h>
-#include <kconfiggroup.h>
-
-template<class FactoryType>
-typename FactoryType::ReturnType
-createOptimizedClass(typename FactoryType::ParamType param)
-{
- static bool isConfigInitialized = false;
- static bool useVectorization = true;
- static bool disableAVXOptimizations = false;
-
- if (!isConfigInitialized) {
- KConfigGroup cfg = KSharedConfig::openConfig()->group("");
- // use the old key name for compatibility
- useVectorization = !cfg.readEntry("amdDisableVectorWorkaround", false);
- disableAVXOptimizations = cfg.readEntry("disableAVXOptimizations", false);
- isConfigInitialized = true;
- }
-
- if (!useVectorization) {
- qWarning() << "WARNING: vector instructions disabled by the \'amdDisableVectorWorkaround\' option!";
- return FactoryType::template create<Vc::ScalarImpl>(param);
- }
-
-#ifdef HAVE_VC
- if (disableAVXOptimizations &&
- (Vc::isImplementationSupported(Vc::AVXImpl) ||
- Vc::isImplementationSupported(Vc::AVX2Impl))) {
- qWarning() << "WARNING: AVX and AVX2 optimizations are disabled by the \'disableAVXOptimizations\' option!";
- }
-
- /**
- * We use SSE2, SSSE3, SSE4.1, AVX and AVX2.
- * The rest are integer and string instructions mostly.
- *
- * TODO: Add FMA3/4 when it is adopted by Vc
- */
- if (!disableAVXOptimizations && Vc::isImplementationSupported(Vc::AVX2Impl)) {
- return FactoryType::template create<Vc::AVX2Impl>(param);
- } else if (!disableAVXOptimizations && Vc::isImplementationSupported(Vc::AVXImpl)) {
- return FactoryType::template create<Vc::AVXImpl>(param);
- } else if (Vc::isImplementationSupported(Vc::SSE41Impl)) {
- return FactoryType::template create<Vc::SSE41Impl>(param);
- } else if (Vc::isImplementationSupported(Vc::SSSE3Impl)) {
- return FactoryType::template create<Vc::SSSE3Impl>(param);
- } else if (Vc::isImplementationSupported(Vc::SSE2Impl)) {
- return FactoryType::template create<Vc::SSE2Impl>(param);
- } else {
-#endif
- (void)disableAVXOptimizations;
- return FactoryType::template create<Vc::ScalarImpl>(param);
-#ifdef HAVE_VC
- }
-#endif
-
-}
-
-template<class FactoryType>
-typename FactoryType::ReturnType
-createOptimizedClass(typename FactoryType::ParamType param, bool forceScalarImplemetation)
-{
- if(forceScalarImplemetation){
- return FactoryType::template create<Vc::ScalarImpl>(param);
- }
- return createOptimizedClass<FactoryType>(param);
-}
-
-#endif /* __KOVCMULTIARCHBUILDSUPPORT_H */
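
With the Vc dispatcher deleted, the factory shape survives in the xsimd world: the template parameter becomes an xsimd architecture type instead of a Vc::Implementation enum value, and the scalar fallback is the generic architecture that this commit's extensions make dispatchable. A hedged sketch of the pattern with a toy factory; the names are ours and the runtime CPU check is deliberately elided:

    #include <cstdio>
    #include <xsimd/xsimd.hpp>

    // Toy factory: create<Arch>() is stamped out once per architecture type.
    struct PrintArchFactory {
        using ReturnType = void;
        using ParamType = const char *;

        template<typename Arch>
        static void create(const char *what)
        {
            std::printf("%s -> %s\n", what, Arch::name());
        }
    };

    template<class FactoryType>
    typename FactoryType::ReturnType createOptimizedClass(typename FactoryType::ParamType param)
    {
    #if XSIMD_WITH_AVX2
        if (false /* runtime AVX2 check, elided in this sketch */) {
            return FactoryType::template create<xsimd::avx2>(param);
        }
    #endif
        // Best architecture enabled at compile time; the real code can drop
        // all the way to xsimd::generic, the moral equivalent of Vc::ScalarImpl.
        return FactoryType::template create<xsimd::default_arch>(param);
    }

    // Usage: createOptimizedClass<PrintArchFactory>("over composite op");
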
diff --git a/libs/pigment/tests/KoRgbU8ColorSpaceTester.cpp b/libs/pigment/tests/KoRgbU8ColorSpaceTester.cpp
index b6576c2312..8bd05b8786 100644
--- a/libs/pigment/tests/KoRgbU8ColorSpaceTester.cpp
+++ b/libs/pigment/tests/KoRgbU8ColorSpaceTester.cpp
@@ -296,7 +296,7 @@ void KoRgbU8ColorSpaceTester::testCompositeOpsWithChannelFlags()
// for posix_memalign()
#include <stdlib.h>
-#include <config-vc.h>
+#include <config-xsimd.h>
#if defined Q_OS_WIN
#define MEMALIGN_ALLOC(p, a, s) ((*(p)) = _aligned_malloc((s), (a)), *(p) ? 0 : errno)
@@ -362,10 +362,10 @@ void KoRgbU8ColorSpaceTester::testCompositeCopyDivisionByZero()
qDebug() << "oriD" << badDst[0] << badDst[1] << badDst[2] << badDst[3];
qDebug() << "expD" << expectedDst[0] << expectedDst[1] << expectedDst[2] << expectedDst[3];
qDebug() << "dst1" << badPixelDstPtr[0] << badPixelDstPtr[1] << badPixelDstPtr[2] << badPixelDstPtr[3];
-#if HAVE_VC
+#if HAVE_XSIMD
QFAIL("Failed to compose pixels");
#else
- qWarning() << "Skipping failed test when Vc library is not used";
+ qWarning() << "Skipping failed test when xsimd library is not used";
#endif
}
};