[education/rkward] rkward: Split out RKParsedScript-class into separate files

Mon May 26 15:06:38 BST 2025

Git commit 730cc38a87681ebd788e8df87238d10ee3e470e8 by Thomas Friedrichsmeier.
Committed on 26/05/2025 at 14:06.
Pushed by tfry into branch 'master'.

Split out RKParsedScript-class into separate files

M  +1    -0    rkward/misc/CMakeLists.txt
A  +149  -0    rkward/misc/rkparsedscript.cpp     [License: GPL(v2.0+)]
A  +77   -0    rkward/misc/rkparsedscript.h     [License: GPL(v2.0+)]
M  +0    -178  rkward/windows/rkcommandeditorwindow.cpp

https://invent.kde.org/education/rkward/-/commit/730cc38a87681ebd788e8df87238d10ee3e470e8

diff --git a/rkward/misc/CMakeLists.txt b/rkward/misc/CMakeLists.txt
index ff0da775d..af1ee9f76 100644
--- a/rkward/misc/CMakeLists.txt
+++ b/rkward/misc/CMakeLists.txt
@@ -33,6 +33,7 @@ SET(misc_STAT_SRCS
    rkdialogbuttonbox.cpp
    rkoutputdirectory.cpp
    rkstyle.cpp
+   rkparsedscript.cpp
    rkparsedversion.cpp
    rkradiogroup.cpp
    rkrapimenu.cpp
diff --git a/rkward/misc/rkparsedscript.cpp b/rkward/misc/rkparsedscript.cpp
new file mode 100644
index 000000000..96b1c89d4
--- /dev/null
+++ b/rkward/misc/rkparsedscript.cpp
@@ -0,0 +1,149 @@
+/*
+rkparsedscript - This file is part of the RKWard project. Created: Sat May 17 2025
+SPDX-FileCopyrightText: 2025 by Thomas Friedrichsmeier <thomas.friedrichsmeier at kdemail.net>
+SPDX-FileContributor: The RKWard Team <rkward-devel at kde.org>
+SPDX-License-Identifier: GPL-2.0-or-later
+*/
+#include "rkparsedscript.h"
+
+#include <QChar>
+
+#include <limits.h>
+
+#include "../debug.h"
+
+/** Parse content into the flat list of contexts (context_list). */
+RKParsedScript::RKParsedScript(const QString &content) {
+	context_list.reserve(200); // just a very wild guess
+	// The Top context starts at -1: the scanning loop in addContext() pre-increments the position,
+	// so the first character it examines is the one at offset 0.
+	addContext(Top, -1, content);
+}
+
+/** Characters that may be part of a symbol: R identifiers consist of letters, digits, '.', and '_'. */
+static bool isSymbolChar(const QChar c) {
+	return c.isLetterOrNumber() || c == u'.' || c == u'_';
+}
+
+/** Add a context of the given type, starting at offset start of content, and scan ahead to find its end.
+ *
+ *  Contexts of type Top, Parenthesis, Brace, and Bracket call this function recursively for each token found
+ *  inside them. Contexts are appended to context_list in the order in which they are encountered.
+ *
+ *  @param type    the kind of context to add
+ *  @param start   offset of the first character of the context (-1 for Top, so that scanning begins at offset 0)
+ *  @param content the full text being parsed
+ *  @return the last offset consumed by this context (content.length(), if the end of content was reached).
+ *          Callers continue scanning at the following offset. */
+int RKParsedScript::addContext(ContextType type, int start, const QString &content) {
+	const int length = static_cast<int>(content.length());
+	// NOTE: this is the type of the most recently added context, at whatever nesting level
+	const ContextType prevtype = context_list.empty() ? None : context_list.back().type;
+
+	int index = static_cast<int>(context_list.size());
+	// some contexts need (or benefit from) special handling depending on the preceding context
+	if (type == OtherOperator && prevtype == OtherOperator) {
+		// Merge any two subsequent operators into one token
+		// i.e. do not add a context, we'll reuse the previous one.
+		--index;
+	} else if (type == Delimiter && content.at(start) == u'\n' && (prevtype == OtherOperator || prevtype == SubsetOperator)) {
+		// newlines do not count as delimiter on operator RHS, so skip ahead, instead of really adding this
+		return start;
+	} else {
+		context_list.emplace_back(type, start); // end will be filled in, later
+	}
+
+	int pos = start;
+	if (type == SingleQuoted || type == DoubleQuoted || type == BackQuoted) {
+		const QChar quote = (type == SingleQuoted) ? u'\'' : ((type == DoubleQuoted) ? u'"' : u'`');
+		while (++pos < length) {
+			const QChar c = content.at(pos);
+			if (c == u'\\') ++pos; // skip over the escaped character
+			else if (c == quote) break;
+		}
+		// A backslash as the very last character of an unterminated string would otherwise leave us one
+		// past content.length(), unlike every other unterminated context.
+		if (pos > length) pos = length;
+	} else if (type == AnySymbol) {
+		while (++pos < length) {
+			if (!isSymbolChar(content.at(pos))) {
+				--pos; // not part of the symbol: leave it to the calling context
+				break;
+			}
+		}
+	} else if (type == Comment) {
+		// a comment extends up to and including the next newline
+		while (++pos < length) {
+			if (content.at(pos) == u'\n') break;
+		}
+	} else if (type == OtherOperator || type == SubsetOperator || type == Delimiter) {
+		// leave context, immediately
+	} else {
+		// Top, Parenthesis, Brace, or Bracket: tokenize the content, recursively, until the matching closing character
+		while (++pos < length) {
+			const QChar c = content.at(pos);
+			if (c == u'\'') pos = addContext(SingleQuoted, pos, content);
+			else if (c == u'"') pos = addContext(DoubleQuoted, pos, content);
+			else if (c == u'`') pos = addContext(BackQuoted, pos, content);
+			else if (c == u'#') pos = addContext(Comment, pos, content);
+			else if (c == u'(') pos = addContext(Parenthesis, pos, content);
+			else if (c == u')' && type == Parenthesis) break;
+			else if (c == u'{') pos = addContext(Brace, pos, content);
+			else if (c == u'}' && type == Brace) break;
+			else if (c == u'[') pos = addContext(Bracket, pos, content);
+			else if (c == u']' && type == Bracket) break;
+			else if (isSymbolChar(c)) pos = addContext(AnySymbol, pos, content);
+			else if (c == u'\n' || c == u',' || c == u';') pos = addContext(Delimiter, pos, content);
+			else if (c == u'$' || c == u'@') pos = addContext(SubsetOperator, pos, content);
+			else if (!c.isSpace()) pos = addContext(OtherOperator, pos, content);
+		}
+	}
+
+	// NOTE: we can't just keep a reference to the context at the start of this function, as the vector
+	//       may re-allocate during nested parsing
+	context_list.at(index).end = pos;
+	return pos;
+}
+
+/** Find the index of the last context in context_list that starts at or before pos.
+ *
+ *  Note that only start positions are compared; the end of the context is not checked.
+ *
+ *  @param pos offset into the parsed text
+ *  @return index into context_list. 0 (the Top context) if no other context starts at or before pos. */
+int RKParsedScript::contextAtPos(int pos) const {
+	const int count = static_cast<int>(context_list.size());
+	// Context 0 is Top, not really of interest
+	for (int i = 1; i < count; ++i) {
+		if (context_list.at(i).start > pos) {
+			return i - 1;
+		}
+	}
+	// No context starts behind pos, so the last context in the list is the one we are looking for
+	// (which is Top itself, if nothing else was parsed).
+	return (count > 0) ? count - 1 : 0;
+}
+
+/** Render the parsed structure as a string: one marker character for each context, plus whatever
+ *  serializeContextEnd() yields at the point where a context ends.
+ *  NOTE: used in debugging, only */
+QString RKParsedScript::serialize() const {
+	QString result;
+	std::vector<Context> open;
+	// sentinel context that never gets closed inside the main loop, so the stack cannot run empty there
+	open.push_back(Context(None, -1, INT_MAX));
+
+	for (const Context &current : context_list) {
+		// close all contexts that ended before the current one starts
+		while (current.start >= open.back().end) {
+			result += serializeContextEnd(open.back(), open.size());
+			open.pop_back();
+		}
+
+		// then open the current context; types not listed produce no opening marker
+		open.push_back(current);
+		switch (current.type) {
+		case Parenthesis:
+			result += u'(';
+			break;
+		case Brace:
+			result += u'{';
+			break;
+		case Bracket:
+			result += u'[';
+			break;
+		case SingleQuoted:
+			result += u'\'';
+			break;
+		case DoubleQuoted:
+			result += u'"';
+			break;
+		case BackQuoted:
+			result += u'`';
+			break;
+		case Comment:
+			result += u'#';
+			break;
+		case SubsetOperator:
+			result += u'$';
+			break;
+		case OtherOperator:
+			result += u'+';
+			break;
+		case AnySymbol:
+			result += u'x';
+			break;
+		default:
+			break;
+		}
+	}
+
+	// finally, close everything that is still open (the sentinel contributes nothing)
+	for (; !open.empty(); open.pop_back()) {
+		result += serializeContextEnd(open.back(), open.size());
+	}
+
+	return result;
+}
+
+using namespace Qt::Literals::StringLiterals;
+
+/** Helper for serialize(): the text to emit where ctx ends.
+ *
+ *  @param ctx   the context being closed
+ *  @param level nesting depth, used for indentation after a line break
+ *  @return the closing character for nested and quoted contexts, a newline plus indentation for
+ *          Comment and Delimiter, and an empty string for all other types. */
+QString RKParsedScript::serializeContextEnd(const Context &ctx, int level) const {
+	switch (ctx.type) {
+	case Parenthesis:
+		return u")"_s;
+	case Brace:
+		return u"}"_s;
+	case Bracket:
+		return u"]"_s;
+	case SingleQuoted:
+		return u"'"_s;
+	case DoubleQuoted:
+		return u"\""_s;
+	case BackQuoted:
+		return u"`"_s;
+	case Comment:
+	case Delimiter: {
+		// start a new line, indented by four spaces per level beyond the first
+		QString line_break = u"\n"_s;
+		for (int remaining = (level - 1) * 4; remaining > 0; --remaining) {
+			line_break += u" "_s;
+		}
+		return line_break;
+	}
+	default:
+		return QString();
+	}
+}
diff --git a/rkward/misc/rkparsedscript.h b/rkward/misc/rkparsedscript.h
new file mode 100644
index 000000000..f08b5aa36
--- /dev/null
+++ b/rkward/misc/rkparsedscript.h
@@ -0,0 +1,77 @@
+/*
+rkparsedscript - This file is part of the RKWard project. Created: Sat May 17 2025
+SPDX-FileCopyrightText: 2025 by Thomas Friedrichsmeier <thomas.friedrichsmeier at kdemail.net>
+SPDX-FileContributor: The RKWard Team <rkward-devel at kde.org>
+SPDX-License-Identifier: GPL-2.0-or-later
+*/
+#ifndef RKPARSEDSCRIPT_H
+#define RKPARSEDSCRIPT_H
+
+#include <QString>
+
+#include <vector>
+
+#include "../debug.h"
+
+/** Very crude, but very fast R parser, with some helper functions for code navigation. Parses the basic structure, only.
+
+Technical note on data structure: While, logically, contexts form a nested hierarchy, a nested data layout does not really lend itself
+to our purpose, which is to navigate the underlying code, sequentially. So rather, we keep a flat list of contexts, sorted (inherently, during parsing)
+by start position.
+
+Inside this flat list, a child context is defined by starting after (or at) the parent's start, and ending before (or at) the parent's end. Child
+contexts are always found after their parent in the list.
+*/
+class RKParsedScript {
+  public:
+	/** Type of context. Besides Top, Parenthesis, Brace, and Bracket are the only ContextTypes that we actually
+	 *  consider as nested, i.e. that may contain child contexts. */
+	enum ContextType {
+		None,
+		Top,
+		Parenthesis,
+		Brace,
+		Bracket,
+		Comment,
+		SingleQuoted,
+		DoubleQuoted,
+		BackQuoted,
+		SubsetOperator,
+		OtherOperator,
+		Delimiter,
+		AnySymbol
+	};
+
+	/** A single parsed context: its type, and the offsets of its start and end in the parsed text. */
+	struct Context {
+		Context(ContextType type, int start) : type(type), start(start) {}
+		Context(ContextType type, int start, int end) : type(type), start(start), end(end) {}
+		ContextType type;
+		int start;
+		int end = -1; // placeholder, while the end is not known, yet (filled in at the end of addContext())
+	};
+
+	RKParsedScript(const QString &content);
+
+	/** Find the (index of the) innermost context containing pos.
+	 *  returns the previous context, if no context actually contains this position (e.g. on a space) */
+	int contextAtPos(int pos) const;
+
+	/** Access the context at the given index of the flat context list. index must be valid. */
+	const Context &getContext(int index) const {
+		return context_list.at(index);
+	}
+
+  private:
+	// add and parse a context. This is where the actual parsing takes place
+	int addContext(ContextType type, int start, const QString &content);
+
+	friend class RKCodeNavigation;
+	// NOTE: used in debugging, only
+	QString serialize() const;
+	QString serializeContextEnd(const Context &ctx, int level) const;
+
+	// I want to modify some objects in place during parsing, without triggering copy-on-write
+	// hence no Qt container
+	std::vector<Context> context_list;
+};
+
+#endif
diff --git a/rkward/windows/rkcommandeditorwindow.cpp b/rkward/windows/rkcommandeditorwindow.cpp
index 0f2b68673..902b5666c 100644
--- a/rkward/windows/rkcommandeditorwindow.cpp
+++ b/rkward/windows/rkcommandeditorwindow.cpp
@@ -65,184 +65,6 @@ SPDX-License-Identifier: GPL-2.0-or-later
 #include "rktexthints.h"
 #include "rkworkplace.h"
 
-#include "../debug.h"
-
-/** Very crude, but very fast R parser. Parses the basic structure, only
-
-Technical note on data structure: While, logically, contexts form a nested hierarchy, a nested data layout does not really lend itself
-to our purpose, which is to navigate the underlying code, sequentially. So rather, we keep a flat list of contexts, sorted (inherently, during parsing)
-by start position.
-
-Inside this flat list, a child context is defined by starting after (or at) the parent's start, and ending before (or at) the parent's end. Child
-contexts are always found after their parent in the list.
-*/
-class RKParsedScript {
-public:
-	enum ContextType {
-		None,
-		Top,
-		Parenthesis,
-		Brace,
-		Bracket,
-		Comment, // 5
-		SingleQuoted,
-		DoubleQuoted,
-		BackQuoted,
-		SubsetOperator,
-		OtherOperator, // 10
-		Delimiter,
-		AnySymbol
-	};
-
-	struct Context {
-		Context(ContextType type, int start) : type(type), start(start) {};
-		Context(ContextType type, int start, int end) : type(type), start(start), end(end) {};
-		ContextType type;
-		int start;
-		int end;
-	};
-
-	std::vector<Context> context_list;
-
-	int contextAtPos(int pos) const {
-		// Context 0 is Top, not really of interest
-		for (int i = 1; i < context_list.size(); ++i) {
-			if (context_list.at(i).start > pos) {
-				return i - 1;
-			}
-		}
-		return 0;
-	}
-	
-	const Context &getContext(int index) const {
-		return context_list.at(index);
-	}
-
-	RKParsedScript(const QString &content) {
-		context_list.reserve(200); // just a very wild guess
-		addContext(Top, -1, content);
-	};
-
-	int addContext(ContextType type, int start, const QString &content) {
-		ContextType prevtype = context_list.empty() ? None : context_list.back().type;
-
-		int index = context_list.size();
-		// some contexts need (or benefit from) special handling depending on the preceding context
-		if (type == OtherOperator && prevtype == OtherOperator) {
-			// Merge any two subsequent operators into one token
-			// i.e. do not add a context, we'll reuse the previous one.
-			--index;
-		} else if (type == Delimiter && content.at(start) == u'\n' && (prevtype == OtherOperator || prevtype == SubsetOperator)) {
-			// newlines do not count as delimiter on operator RHS, so skip ahead, instead of really adding this
-			return start;
-		} else {
-			context_list.emplace_back(type, start); // end will be filled in, later
-		}
-
-		int pos = start;
-		if (type == SingleQuoted || type == DoubleQuoted || type == BackQuoted) {
-			while (++pos < content.length()) {
-				const QChar c = content.at(pos);
-				if (c == u'\\') ++pos;
-				else if (c == u'\'' && type == SingleQuoted) break;
-				else if (c == u'"' && type == DoubleQuoted) break;
-				else if (c == u'`' && type == BackQuoted) break;
-			}
-		} else if (type == AnySymbol) {
-			while (++pos < content.length()) {
-				const QChar c = content.at(pos);
-				if (!c.isLetterOrNumber() && c != u'.') {
-					--pos;
-					break;
-				}
-			}
-		} else if (type == Comment) {
-			while (++pos < content.length()) {
-				if (content.at(pos) == u'\n') break;
-			}
-		} else if (type == OtherOperator || type == SubsetOperator || type == Delimiter) {
-			// leave context, immediately
-		} else {
-			while (++pos < content.length()) {
-				QChar c = content.at(pos);
-				if (c == u'\'') pos = addContext(SingleQuoted, pos, content);
-				else if (c == u'"') pos = addContext(DoubleQuoted, pos, content);
-				else if (c == u'`') pos = addContext(BackQuoted, pos, content);
-				else if (c == u'#') pos = addContext(Comment, pos, content);
-				else if (c == u'(') pos = addContext(Parenthesis, pos, content);
-				else if (c == u')' && type == Parenthesis) break;
-				else if (c == u'{') pos = addContext(Brace, pos, content);
-				else if (c == u'}' && type == Brace) break;
-				else if (c == u'[') pos = addContext(Bracket, pos, content);
-				else if (c == u']' && type == Bracket) break;
-				else if (c.isLetterOrNumber() || c == u'.') pos = addContext(AnySymbol, pos, content);
-				else if (c == u'\n' || c == u',' || c == u';') pos = addContext(Delimiter, pos, content);
-				else if (c == u'$' || c == u'@') pos = addContext(SubsetOperator, pos, content);
-				else if (!c.isSpace()) pos = addContext(OtherOperator, pos, content);
-			}
-		}
-
-		// NOTE: we can't just keep a reference to the context at the start of this function, as the vector
-		//       may re-allocate during nested parsing
-		context_list.at(index).end = pos;
-		return pos;
-	};
-
-	// NOTE: used in debugging, only
-	QString serialize() {
-		QString ret;
-		std::vector<Context> stack;
-		stack.push_back(Context(None, -1, INT_MAX)); // dummy context, to avoid empty stack
-
-		for (unsigned int i = 0; i < context_list.size(); ++i) {
-			const auto ctx = context_list.at(i);
-
-			// end any finished contexts
-			while (ctx.start >= stack.back().end) {
-				ret += endContext(stack.back(), stack.size());
-				stack.pop_back();
-			}
-
-			// now deal with the current context
-			stack.push_back(ctx);
-			const auto type = ctx.type;
-			if (type == Parenthesis) ret += u'(';
-			if (type == Brace) ret += u'{';
-			if (type == Bracket) ret += u'[';
-			if (type == SingleQuoted) ret += u'\'';
-			if (type == DoubleQuoted) ret += u'"';
-			if (type == BackQuoted) ret += u'`';
-			if (type == Comment) ret += u'#';
-			if (type == SubsetOperator) ret += u'$';
-			if (type == OtherOperator) ret += u'+';
-			if (type == AnySymbol) ret += u'x';
-		}
-		while (!stack.empty()) {
-			ret += endContext(stack.back(), stack.size());
-			stack.pop_back();
-		}
-
-		return ret;
-	}
-
-	QString endContext(const Context &ctx, int level) {
-		const auto ptype = ctx.type;
-
-		if (ptype == Parenthesis) return u")"_s;
-		if (ptype == Brace) return u"}"_s;
-		if (ptype == Bracket) return u"]"_s;
-		if (ptype == SingleQuoted) return u"'"_s;
-		if (ptype == DoubleQuoted) return u"\""_s;
-		if (ptype == BackQuoted) return u"`"_s;
-		if (ptype == Comment || ptype == Delimiter) {
-			QString ret = u"\n"_s;
-			for (int j = 0; j < (level-1) * 4; ++j) ret += u" "_s;
-			return ret;
-		}
-		return QString();
-	}
-};
-
 class RKCodeNavigation : public QWidget {
   private:
 	RKCodeNavigation(KTextEditor::View *view) : QWidget(view, Qt::Popup | Qt::FramelessWindowHint | Qt::BypassWindowManagerHint), view(view), doc(view->document()) {