[Kst] branches/work/kst/portto4/kst/src/datasources/ascii

Peter Kümmel syntheticpp at gmx.net
Tue Oct 16 14:47:48 UTC 2012


SVN commit 1320704 by kuemmel:

read big files into multiple chunks

 M  +6 -5      asciidatareader.cpp  
 M  +5 -4      asciidatareader.h  
 M  +144 -20   asciifilebuffer.cpp  
 M  +39 -5     asciifilebuffer.h  
 M  +11 -2     asciisource.cpp  


--- branches/work/kst/portto4/kst/src/datasources/ascii/asciidatareader.cpp #1320703:1320704
@@ -20,6 +20,7 @@
 #include "measuretime.h"
 
 #include <QFile>
+#include <QDebug>
 #include <ctype.h>
 #include <stdlib.h>
 
@@ -114,13 +115,13 @@
   detectLineEndingType(file);
 
   bool new_data = false;
-  AsciiFileBuffer buf;
+  AsciiFileData buf;
   do {
     // Read the tmpbuffer, starting at row_index[_numFrames]
     buf.clear();
 
     // always read from the start of a line
-    buf.read(file, _rowIndex[_numFrames], _byteLength - buf.begin(), AsciiFileBuffer::Prealloc - 1);
+    buf.read(file, _rowIndex[_numFrames], _byteLength - buf.begin(), AsciiFileData::Prealloc - 1);
     if (buf.bytesRead() == 0) {
       return false;
     }
@@ -147,7 +148,7 @@
         new_data = findDataRows(buf.constData(), buf.begin(), buf.bytesRead(), IsLineBreakCR(_lineending), comment_del);
       }
     }
-  } while (buf.bytesRead() == AsciiFileBuffer::Prealloc - 1  && read_completely);
+  } while (buf.bytesRead() == AsciiFileData::Prealloc - 1  && read_completely);
 
   _rowIndex.resize(_numFrames + 1);
 
@@ -173,7 +174,7 @@
         _rowIndex[_numFrames] = row_start;
         ++_numFrames;
         if (_numFrames >= _rowIndex.size()) {
-          _rowIndex.resize(_rowIndex.size() + AsciiFileBuffer::Prealloc - 1);
+          _rowIndex.resize(_rowIndex.size() + AsciiFileData::Prealloc - 1);
         }
         new_data = true;
         row_start = row_offset+i;
@@ -191,7 +192,7 @@
 }
 
 //-------------------------------------------------------------------------------------------
-int AsciiDataReader::readField(const AsciiFileBuffer& buf, int col, double *v, const QString& field, int s, int n)
+int AsciiDataReader::readField(const AsciiFileData& buf, int col, double *v, const QString& field, int s, int n)
 {
   if (_config._columnType == AsciiSourceConfig::Fixed) {
     MeasureTime t("AsciiSource::readField: same width for all columns");
--- branches/work/kst/portto4/kst/src/datasources/ascii/asciidatareader.h #1320703:1320704
@@ -29,21 +29,22 @@
     AsciiDataReader(AsciiSourceConfig& config);
     ~AsciiDataReader();
 
-    typedef QVarLengthArray<int, AsciiFileBuffer::Prealloc> RowIndex;
-
     void clear();
     void setRow0Begin(int begin);
     inline int beginOfRow(int row) const { return _rowIndex[row]; }
     inline int numberOfFrames() const { return _numFrames; }
 
+    // where
+    const AsciiFileBuffer::RowIndex& rowIndex() const { return _rowIndex; }
+    
     void detectLineEndingType(QFile& file);
     
     bool findDataRows(bool read_completely, QFile& file, int _byteLength);
-    int readField(const AsciiFileBuffer &buf, int col, double *v, const QString& field, int s, int n);
+    int readField(const AsciiFileData &buf, int col, double *v, const QString& field, int s, int n);
 
   private:
     int _numFrames;
-    RowIndex _rowIndex;
+    AsciiFileBuffer::RowIndex _rowIndex;
     AsciiSourceConfig& _config;
     AsciiCharacterTraits::LineEndingType _lineending;
 
--- branches/work/kst/portto4/kst/src/datasources/ascii/asciifilebuffer.cpp #1320703:1320704
@@ -26,21 +26,21 @@
 #undef qFree
 
 #include "asciifilebuffer.h"
-
 #include "debug.h"
 
 #include <QFile>
 #include <QDebug>
-#include <QMap>
 
+
 static int MB = 1024*1024;
 
 // Simulate out of memory scenario
 //#define KST_TEST_OOM
+
 #ifdef KST_TEST_OOM
-static const size_t maxMB = 50;
+static size_t maxAllocate = 2 * MB;
 #else
-static const size_t maxMB = 0;
+static size_t maxAllocate = (size_t) -1;
 #endif
 
 #define KST_MEMORY_DEBUG if(1)
@@ -57,25 +57,26 @@
     it.next();
     sum +=  it.value();
   }
-  Kst::Debug::self()->log(QString("AsciiFileBuffer: %1 MB used").arg(sum/MB), Kst::Debug::Warning);
-  KST_MEMORY_DEBUG qDebug() << "AsciiFileBuffer: " << sum/MB<< "MB used";
+  Kst::Debug::self()->log(QString("AsciiFileData: %1 MB used").arg(sum / MB), Kst::Debug::Warning);
+  KST_MEMORY_DEBUG qDebug() << "AsciiFileData: " << sum / MB<< "MB used";
 }
 
 //-------------------------------------------------------------------------------------------
 void* fileBufferMalloc(size_t bytes)
 {
   void* ptr = 0;
-  if (maxMB == 0 || bytes < maxMB*MB) {
+#ifdef KST_TEST_OOM
+  if (bytes <= maxAllocate)
+#endif
     ptr = malloc(bytes);
-  }
   if (ptr)  {
     allocatedMBs[ptr] = bytes;
     KST_MEMORY_DEBUG qDebug() << "AsciiFileBuffer: " << bytes/MB << "MB allocated";
     KST_MEMORY_DEBUG logMemoryUsed();
   } else {
-    Kst::Debug::self()->log(QString("AsciiFileBuffer: failed to allocate %1 MBs").arg(bytes/MB), Kst::Debug::Warning);
+    Kst::Debug::self()->log(QString("AsciiFileData: failed to allocate %1 MBs").arg(bytes / MB), Kst::Debug::Warning);
     logMemoryUsed();
-    KST_MEMORY_DEBUG qDebug() << "AsciiFileBuffer: error when allocating " << bytes/MB << "MB";
+    KST_MEMORY_DEBUG qDebug() << "AsciiFileData: error when allocating " << bytes / MB << "MB";
   }
   return ptr;
 }
@@ -84,7 +85,7 @@
 void fileBufferFree(void* ptr)
 {
   if (allocatedMBs.contains(ptr)) {
-    KST_MEMORY_DEBUG qDebug() << "AsciiFileBuffer: " << allocatedMBs[ptr]/MB << "MB freed";
+    KST_MEMORY_DEBUG qDebug() << "AsciiFileData: " << allocatedMBs[ptr] / MB << "MB freed";
     allocatedMBs.remove(ptr);
   }
   KST_MEMORY_DEBUG logMemoryUsed();
@@ -92,31 +93,29 @@
 }
 
 //-------------------------------------------------------------------------------------------
-AsciiFileBuffer::AsciiFileBuffer() : _array(new Array), _begin(-1), _bytesRead(0)
+AsciiFileData::AsciiFileData() : _array(new Array), _begin(-1), _bytesRead(0), _rowBegin(-1), _rowsRead(0)
 {
 }
 
 //-------------------------------------------------------------------------------------------
-AsciiFileBuffer::~AsciiFileBuffer()
+AsciiFileData::~AsciiFileData()
 {
-  delete _array;
 }
 
-
 //-------------------------------------------------------------------------------------------
-char* AsciiFileBuffer::data()
+char* AsciiFileData::data()
 {
   return _array->data();
 }
 
 //-------------------------------------------------------------------------------------------
-const char* const AsciiFileBuffer::constPointer() const
+const char* const AsciiFileData::constPointer() const
 {
   return _array->data();
 }
 
 //-------------------------------------------------------------------------------------------
-bool AsciiFileBuffer::resize(int bytes)
+bool AsciiFileData::resize(int bytes)
 { 
   try {
     _array->resize(bytes);
@@ -129,7 +128,7 @@
 }
 
 //-------------------------------------------------------------------------------------------
-void AsciiFileBuffer::clear(bool forceDeletingArray)
+void AsciiFileData::clear(bool forceDeletingArray)
 {
   // force deletion of heap allocated memory if any
   if (forceDeletingArray || _array->capacity() > Prealloc) {
@@ -141,11 +140,20 @@
 }
 
 //-------------------------------------------------------------------------------------------
-void AsciiFileBuffer::read(QFile& file, int start, int bytesToRead, int maximalBytes)
+void AsciiFileData::release()
 {
+  delete _array;
+  _array = 0;
   _begin = -1;
   _bytesRead = 0;
+}
 
+//-------------------------------------------------------------------------------------------
+void AsciiFileData::read(QFile& file, int start, int bytesToRead, int maximalBytes)
+{
+  _begin = -1;
+  _bytesRead = 0;
+
   if (bytesToRead <= 0)
     return;
 
@@ -167,3 +175,119 @@
   _bytesRead = bytesRead;
 }
 
+
+//-------------------------------------------------------------------------------------------
+AsciiFileBuffer::AsciiFileBuffer()
+{
+}
+
+//-------------------------------------------------------------------------------------------
+AsciiFileBuffer::~AsciiFileBuffer()
+{
+  clear();
+}
+
+//-------------------------------------------------------------------------------------------
+void AsciiFileBuffer::clear(bool forceDeletingArray)
+{
+  foreach (AsciiFileData chunk, _fileData) {
+    chunk.release();
+  }
+  _fileData.clear();
+}
+
+//-------------------------------------------------------------------------------------------
+const QVector<AsciiFileData>& AsciiFileBuffer::data() const
+{
+  return _fileData;
+}
+
+//-------------------------------------------------------------------------------------------
+void AsciiFileBuffer::logData() const
+{
+  int i = 0;
+  foreach (const AsciiFileData& chunk, _fileData) {
+    qDebug() << "_fileData: " << i << ". " << chunk.rowBegin() << " ... " << chunk.rowBegin() + chunk.rowsRead();
+    i++;
+  }
+}
+
+//-------------------------------------------------------------------------------------------
+static int findRowOfPosition(const AsciiFileBuffer::RowIndex& rowIndex, int searchStart, int pos)
+{
+  //TODO too expensive
+  const int size = rowIndex.size();
+  for (int row = searchStart; row != size; row++) {
+    if (rowIndex[row] > pos)
+      return row - 1;
+  }
+  // must be the last row
+  return size - 1;
+}
+
+//-------------------------------------------------------------------------------------------
+void AsciiFileBuffer::read(QFile& file, const RowIndex& rowIndex, int start, int bytesToRead, int maximalBytes)
+{
+  _begin = -1;
+  _bytesRead = 0;
+  _fileData.clear();
+
+  // first try to read the whole file into one array
+  AsciiFileData wholeFile;
+  wholeFile.read(file, start, bytesToRead, maximalBytes);
+  if (bytesToRead == wholeFile.bytesRead()) {
+    wholeFile.setRowBegin(0);
+    wholeFile.setRowsRead(rowIndex.size());
+    _begin = start;
+    _bytesRead = bytesToRead;
+    _fileData << wholeFile;
+    return;
+  } else {
+    wholeFile.release();
+  }
+
+
+  // reading whole file into one array failed, try to read into smaller arrays
+  int chunkSize = qMin((size_t) 10 * MB, maxAllocate);
+  int end = start + bytesToRead;
+  int chunkRead = 0;
+  int row = 0;
+  for (int pos = start; pos < end; pos += chunkRead) {
+    AsciiFileData chunk;
+    // remember first row index
+    chunk.setRowBegin(row);
+    // read complete chunk or to end of file
+    chunkRead = (pos + chunkSize < end ? chunkSize : end - pos);  
+    // adjust to row end: pos + chunkRead is in the middle of a row, find index of this row
+    row = findRowOfPosition(rowIndex, row, pos + chunkRead);
+    // read until the beginning of this row
+    chunkRead = (rowIndex[row] - 1);
+    // check if it is the last row, and read remaining bytes from pos
+    chunkRead = (row == rowIndex.size() - 1) ? end - pos : chunkRead - pos;
+    // read the rows
+    chunk.read(file, pos, chunkRead);
+    if (chunkRead != chunk.bytesRead()) {
+      Kst::Debug::self()->log(QString("AsciiFileBuffer: error when reading into chunk"));
+      chunk.release();
+      break;
+    }
+    // remember number of read rows
+    chunk.setRowsRead(row - chunk.rowBegin());
+    _fileData << chunk;
+    _bytesRead += chunk.bytesRead();
+  }
+  if (_bytesRead == bytesToRead) {
+    _begin = start;
+    return;
+  } else {
+    _bytesRead = 0;
+    _fileData.clear();
+    Kst::Debug::self()->log(QString("AsciiFileBuffer: error while reading %1 chunks").arg(_fileData.size()));
+  }
+
+  // sliding window
+  // TODO
+}
+
+
+
--- branches/work/kst/portto4/kst/src/datasources/ascii/asciifilebuffer.h #1320703:1320704
@@ -13,12 +13,14 @@
 #ifndef ASCII_FILE_BUFFER_H
 #define ASCII_FILE_BUFFER_H
 
+#include <QVector>
+
 template<class T, int Prealloc>
 class QVarLengthArray;
 
 class QFile;
 
-class AsciiFileBuffer
+class AsciiFileData
 {
 public:
 
@@ -34,14 +36,12 @@
 
   typedef QVarLengthArray<char, Prealloc> Array;
   
-  AsciiFileBuffer();
-  ~AsciiFileBuffer();
+  AsciiFileData();
+  ~AsciiFileData();
 
   inline int begin() const { return _begin; }
   inline int bytesRead() const { return _bytesRead; }
-
   void read(QFile&, int start, int numberOfBytes, int maximalBytes = -1);
-
   char* data();
 
   const char* const constPointer() const;
@@ -49,13 +49,47 @@
 
   bool resize(int size);
   void clear(bool forceDeletingArray = false);
+  void release();
 
+  inline int rowBegin() const { return _rowBegin; }
+  inline int rowsRead() const { return _rowsRead; }
+  inline void setRowBegin(int begin) { _rowBegin = begin; }
+  inline void setRowsRead(int read) { _rowsRead = read; }
+
 private:
   Array* _array;
   int _begin;
   int _bytesRead;
+  int _rowBegin;
+  int _rowsRead;
 };
 
+Q_DECLARE_TYPEINFO(AsciiFileData, Q_MOVABLE_TYPE);
 
+
+class AsciiFileBuffer
+{
+public:
+  AsciiFileBuffer();
+  ~AsciiFileBuffer();
+  
+  typedef QVarLengthArray<int, AsciiFileData::Prealloc> RowIndex;
+
+  inline int begin() const { return _begin; }
+  inline int bytesRead() const { return _bytesRead; }
+
+  void clear(bool forceDeletingArray = false);
+  
+  void read(QFile&, const RowIndex& rowIndex, int start, int numberOfBytes, int maximalBytes = -1);
+  
+  const QVector<AsciiFileData>& data() const;
+  
+private:
+  QVector<AsciiFileData> _fileData;
+  int _begin;
+  int _bytesRead;
+  void logData() const;
+};
+
 #endif
 // vim: ts=2 sw=2 et
--- branches/work/kst/portto4/kst/src/datasources/ascii/asciisource.cpp #1320703:1320704
@@ -242,6 +242,7 @@
   
   // reading whole file into memory failed
   
+  /*
   // find a smaller allocatable size
   _fileBuffer.clear();
   int realloc_size = n / 4;
@@ -271,6 +272,8 @@
   // don't buffer partial files
   _fileBuffer.clear();
   return n_read;
+  */
+  return 0;
 }
 
 
@@ -302,7 +305,7 @@
     if (!openValidFile(file)) {
       return 0;
     }
-    _fileBuffer.read(file, begin, bytesToRead);
+    _fileBuffer.read(file, _reader.rowIndex(), begin, bytesToRead);
     if (_fileBuffer.bytesRead() == 0) {
       success = false;
       return 0;
@@ -310,10 +313,16 @@
     _reader.detectLineEndingType(file);
   }
   
-  return _reader.readField(_fileBuffer, col, v, field, s, n);
+  int sRead = 0;
+  const QVector<AsciiFileData> data = _fileBuffer.data();
+  foreach (const AsciiFileData& chunk, data) {
+    sRead += _reader.readField(chunk, col, v + sRead, field, chunk.rowBegin(), chunk.rowsRead());
 }
 
+  return sRead;
+}
 
+
 //-------------------------------------------------------------------------------------------
 QString AsciiSource::fileType() const 
 {


More information about the Kst mailing list