[calligra/kexi-altertable-staniek] krita/plugins/paintops/libbrush: Optimized Auto Brush mask filling code

Wed Oct 31 11:29:34 UTC 2012

Git commit 8127e2a70646f67838dc788ae7f5c91a69aafd25 by Jaroslaw Staniek, on behalf of Dmitry Kazakov.
Committed on 10/10/2012 at 12:30.
Pushed by staniek into branch 'kexi-altertable-staniek'.

Optimized Auto Brush mask filling code

This patch uses internal cpu parallelism and makes the code execute much
faster in the 'KisStrokeBenchmark pixelbrush300pxRL' benchmark.

Actual results in the benchmark:
Sandy Bridge (Core i7-2600): +25%
Merom (Core 2 Duo T7250): +10%

According to VTune the painting should have become up to 10% faster
(on Sandy Bridge), because now this part of code consumes almost no time.

This optimization will work most on the highest precision levels, that is
when a dab cannot be cached.

CCMAIL:kimageshop at kde.org

M  +90   -13   krita/plugins/paintops/libbrush/kis_auto_brush.cpp

http://commits.kde.org/calligra/8127e2a70646f67838dc788ae7f5c91a69aafd25

diff --git a/krita/plugins/paintops/libbrush/kis_auto_brush.cpp b/krita/plugins/paintops/libbrush/kis_auto_brush.cpp
index 4449d9f..34a0b76 100644
--- a/krita/plugins/paintops/libbrush/kis_auto_brush.cpp
+++ b/krita/plugins/paintops/libbrush/kis_auto_brush.cpp
@@ -237,7 +237,86 @@ KisAutoBrush::~KisAutoBrush()
     delete d;
 }
 
+inline void fillPixelOptimized_4bytes(quint8 *color, quint8 *buf, int size)
+{
+    /**
+     * This version of filling uses low granularity of data transfers
+     * (32-bit chunks) and internal processor's parallelism. It reaches
+     * 25% better performance in KisStrokeBenchmark in comparison to
+     * per-pixel memcpy version (tested on Sandy Bridge).
+     */
+
+    int block1 = size / 8;
+    int block2 = size % 8;
+
+    quint32 *src = reinterpret_cast<quint32*>(color);
+    quint32 *dst = reinterpret_cast<quint32*>(buf);
+
+    // check whether all buffers are 4 bytes aligned
+    // (uncomment if experience some problems)
+    // Q_ASSERT(((qint64)src & 3) == 0);
+    // Q_ASSERT(((qint64)dst & 3) == 0);
+
+    for (int i = 0; i < block1; i++) {
+        *dst = *src;
+        *(dst+1) = *src;
+        *(dst+2) = *src;
+        *(dst+3) = *src;
+        *(dst+4) = *src;
+        *(dst+5) = *src;
+        *(dst+6) = *src;
+        *(dst+7) = *src;
+
+        dst += 8;
+    }
+
+    for (int i = 0; i < block2; i++) {
+        *dst = *src;
+        dst++;
+    }
+}
 
+inline void fillPixelOptimized_general(quint8 *color, quint8 *buf, int size, int pixelSize)
+{
+    /**
+     * This version uses internal processor's parallelism and gives
+     * 20% better performance in KisStrokeBenchmark in comparison to
+     * per-pixel memcpy version (tested on Sandy Bridge (+20%) and
+     * on Merom (+10%)).
+     */
+
+    int block1 = size / 8;
+    int block2 = size % 8;
+
+    for (int i = 0; i < block1; i++) {
+        quint8 *d1 = buf;
+        quint8 *d2 = buf + pixelSize;
+        quint8 *d3 = buf + 2 * pixelSize;
+        quint8 *d4 = buf + 3 * pixelSize;
+        quint8 *d5 = buf + 4 * pixelSize;
+        quint8 *d6 = buf + 5 * pixelSize;
+        quint8 *d7 = buf + 6 * pixelSize;
+        quint8 *d8 = buf + 7 * pixelSize;
+
+        for (int j = 0; j < pixelSize; j++) {
+            *(d1 + j) = color[j];
+            *(d2 + j) = color[j];
+            *(d3 + j) = color[j];
+            *(d4 + j) = color[j];
+            *(d5 + j) = color[j];
+            *(d6 + j) = color[j];
+            *(d7 + j) = color[j];
+            *(d8 + j) = color[j];
+        }
+
+        buf += 8 * pixelSize;
+    }
+
+    for (int i = 0; i < block2; i++) {
+        memcpy(buf, color, pixelSize);
+        buf += pixelSize;
+    }
+}
 
 void KisAutoBrush::generateMaskAndApplyMaskOrCreateDab(KisFixedPaintDeviceSP dst,
         KisBrush::ColoringInformation* coloringInformation,
@@ -299,23 +378,21 @@ void KisAutoBrush::generateMaskAndApplyMaskOrCreateDab(KisFixedPaintDeviceSP dst
 
     d->shape->setSoftness( softnessFactor );
 
-    for (int y = 0; y < dstHeight; y++) {
-        for (int x = 0; x < dstWidth; x++) {
-
-            if (coloringInformation) {
-                if (color) {
-                    memcpy(dabPointer, color, pixelSize);
-                } else {
+    if (coloringInformation) {
+        if (color && pixelSize == 4) {
+            fillPixelOptimized_4bytes(color, dabPointer, dstWidth * dstHeight);
+        } else if (color) {
+            fillPixelOptimized_general(color, dabPointer, dstWidth * dstHeight, pixelSize);
+        } else {
+            for (int y = 0; y < dstHeight; y++) {
+                for (int x = 0; x < dstWidth; x++) {
                     memcpy(dabPointer, coloringInformation->color(), pixelSize);
                     coloringInformation->nextColumn();
                 }
+                coloringInformation->nextRow();
             }
-            dabPointer += pixelSize;
-            }//endfor x
-            if (!color && coloringInformation) {
-            coloringInformation->nextRow();
-            }
-    }//endfor y
+        }
+    }
 
     MaskProcessor s(dst, cs, d->randomness, d->density, centerX, centerY, invScaleX, invScaleY, angle, d->shape);
     int jobs = d->idealThreadCountCached;