Matrix Template Klasse als Beispiel für die Verwendung von Allokatoren

*john 0

Ich wollte die ganze Zeit schon das Beispiel posten, aber irgend wie wurde es zu umfangreich, und schlussendlich ist es immer noch nicht fertig, und die Spezialisierung für die Verwendung einer optimierten BLAS/LAPACK fehlt halt auch noch etc. Diesen Header als Modul mit GCC 11.3.0, 12.x oder clang-++14 zu übersetzen scheitert bei simplen Testprogrammen schon und irgend ein interner Compilerfehler tritt auf. Daher ist das ganze noch nicht als Modul umgesetzt.

Konstruktive Kritik ist immer gern gelesen.

#ifndef MATRIX_H_
#define MATRIX_H_

#include <memory>
#include <utility>
#include <concepts>
#include <iostream>
#include <stdexcept>
#include <functional>
#include <type_traits>
#include <initializer_list>

namespace linear_algebra {
	using std::enable_if_t, std::is_signed_v, std::is_unsigned_v, std::is_same_v, std::same_as, std::swap, std::move, std::max;

	class Transposed {
	public:
		using transposed = std::true_type;
	};

	class TransposedNo {
	public:
		using transposed = std::false_type;
	};

	template <typename T>
	concept CheckTransposed = std::same_as<T,Transposed>;

	template <typename T>
	concept CheckTransposedNo = std::same_as<T,TransposedNo>;

	template <typename T>
	concept CheckTransposition = std::same_as<T,Transposed> || std::same_as<T,TransposedNo>;

	template <typename T>
	concept Addition = requires (T a, T b) {a + b;};

	template <typename T>
	concept Substraction = requires (T a, T b) {a - b;};

	template <typename T>
	concept Multiplication = requires (T a, T b) {a * b;};

	template <typename T>
	concept Division = requires (T a, T b) {a / b;};

	template <typename T>
	concept Field = Addition<T> && Substraction<T> && Multiplication<T> && Division<T>
	  && std::regular<T>;

	template<typename IndexType>
	class Bounds {
	public:
		Bounds () noexcept;
		Bounds (const IndexType upper);
		explicit Bounds (const IndexType lower, const IndexType upper);
		consteval Bounds (const std::initializer_list<const IndexType>&);
		constexpr void null() noexcept;
	};

	template <typename IndexType>
	requires std::signed_integral<IndexType> && (!std::same_as<IndexType,bool>)
	class Bounds<IndexType> {
	public:
		IndexType lower_, upper_;

		Bounds () noexcept : lower_(0), upper_(0) {}
		Bounds (const IndexType upper) : lower_(0), upper_(upper) {
			if (upper < 0) throw std::logic_error ("upper bound must be larger than zero");
		}
		explicit Bounds (const IndexType lower, const IndexType upper) : lower_(lower), upper_(upper) {
			if (lower > upper) throw std::logic_error ("lower bound must be smaller or equal to upper bound");
		}
		consteval Bounds (const std::initializer_list<const IndexType>& list) {
			if (2 != list.size()) throw std::logic_error ("Bounds<IndexType=signed_integral> needs two Elements");

			auto it = list.begin();
			lower_ = *it;
			upper_ = *(++it);
		}

		constexpr void null () noexcept {
			lower_ = 0;
			upper_ = 0;
		}
	};

	template <typename IndexType>
	requires std::unsigned_integral<IndexType> && (!std::same_as<IndexType,bool>)
	class Bounds<IndexType> {
	public:
		static constexpr IndexType lower_ = 0;
		IndexType upper_;

		Bounds () noexcept : upper_(0) {}
		Bounds (const IndexType upper) : upper_(upper) {}
		explicit Bounds (const IndexType lower [[maybe_unused]], const IndexType upper) : upper_(upper) {}
		consteval Bounds (const std::initializer_list<const IndexType>& list) {
			if (1 != list.size()) throw std::logic_error ("Bounds<IndexType=unsigned_integral> needs one Element exactly");

			auto it = list.begin();
			upper_ = *it;
		}
		constexpr void null() noexcept {
			upper_ = 0;
		}
	};

	template <typename IndexType>
	Bounds(const IndexType) -> Bounds<IndexType>;

	template<typename IndexType>
	std::ostream&
	operator<< (std::ostream& o, const Bounds<IndexType>& bounds) {
		o << "[" << bounds.lower_ << ", " << bounds.upper_ << "[";

		return o;
	}

	/// Fortran style arrays
	template<typename SizeType, typename IndexType, typename Transposition = TransposedNo>
	class ColumnMajorOrder;

	template <typename SizeType, typename IndexType, typename Transposition>
	requires std::integral<SizeType> && std::integral<IndexType> && CheckTransposedNo<Transposition> && (! std::same_as<SizeType, bool>) && (! std::same_as<IndexType, bool>)
	class ColumnMajorOrder<SizeType, IndexType, Transposition> {
	public:
		static inline constexpr SizeType indx (const Bounds<IndexType>& m, const Bounds<IndexType>& n, const IndexType i, const IndexType j, const SizeType ld)	noexcept {
			return ((i - m.lower_) + ld * (j - n.lower_));
		}
		static inline constexpr SizeType size (const Bounds<IndexType>& m [[maybe_unused]], const Bounds<IndexType>& n, const SizeType ld) noexcept {
			return ld * (n.upper_ - n.lower_);
		}
		static inline constexpr SizeType calc_ldim (const Bounds<IndexType>& m [[maybe_unused]], const Bounds<IndexType>& n)	noexcept {
			return (n.upper_ - n.lower_);
		}
		template <typename T, typename TranspositionSource, template<typename,typename,typename> class OrderSource>
		static inline constexpr void assign (const Bounds<IndexType>& m, const Bounds<IndexType>& n, const SizeType ld, const T* const source, T* const target) {
			for (IndexType j = n.lower_; j != n.upper_; ++j) {
				for (IndexType i = m.lower_; i != m.upper_; ++i) {
					target[indx(m,n,i,j,ld)] = source[OrderSource<SizeType,IndexType,TranspositionSource>::indx(m,n,i,j,ld)];
				}
			}
		}	
	};

	template <typename SizeType, typename IndexType, typename Transposition>
	requires std::integral<SizeType> && std::integral<IndexType> && CheckTransposed<Transposition> && (! std::same_as<SizeType, bool>) && (! std::same_as<IndexType, bool>)
	class ColumnMajorOrder<SizeType, IndexType, Transposition> {
	public:
		static inline constexpr SizeType indx (const Bounds<IndexType>& m, const Bounds<IndexType>& n, const IndexType i, const IndexType j, const SizeType ld)	noexcept {
			return ((j - n.lower_) + ld * (i - m.lower_));
		}
		static inline constexpr SizeType size (const Bounds<IndexType>& m, const Bounds<IndexType>& n [[maybe_unused]], const SizeType ld) noexcept {
			return ld * (m.upper_ - m.lower_);
		}
		static inline constexpr SizeType calc_ldim (const Bounds<IndexType>& m, const Bounds<IndexType>& n [[maybe_unused]]) noexcept {
			return (m.upper_ - m.lower_);
		}
		template <typename T, typename TranspositionSource, template<typename,typename,typename> class OrderSource>
		static inline constexpr void assign (const Bounds<IndexType>& m, const Bounds<IndexType>& n, const SizeType ld, const T* const source, T* const target) {
			for (IndexType i = m.lower_; i != m.upper_; ++i) {
				for (IndexType j = n.lower_; j != n.upper_; ++j) {
					target[indx(m,n,i,j,ld)] = source[OrderSource<SizeType,IndexType,TranspositionSource>::indx(m,n,i,j,ld)];
				}
			}
		}
	};

	/// C/C++ style arrays
	template <typename size_type, typename index_type, typename Transposition = TransposedNo>
	class RowMajorOrder;

	template <typename SizeType, typename IndexType, typename Transposition>
	requires std::integral<SizeType> && std::integral<IndexType> && CheckTransposedNo<Transposition> && (! std::same_as<SizeType, bool>) && (! std::same_as<IndexType, bool>)
	class RowMajorOrder <SizeType, IndexType, Transposition> {
	public:
		static inline constexpr SizeType indx (const Bounds<IndexType>& m, const Bounds<IndexType>& n, const IndexType i, const IndexType j, const SizeType ld) noexcept {
			return (ld * SizeType(i - m.lower_) + SizeType(j - n.lower_));
		}
		static inline constexpr SizeType size (const Bounds<IndexType>& m, const Bounds<IndexType>& n [[maybe_unused]], const SizeType ld) noexcept {
			return ld * SizeType(m.upper_ - m.lower_);
		}
		static inline constexpr SizeType calc_ldim (const Bounds<IndexType>& m, const Bounds<IndexType>& n [[maybe_unused]]) noexcept {
			return SizeType(m.upper_ - m.lower_);
		}
		template <typename T, typename TranspositionSource, template<typename,typename,typename> class OrderSource>
		static inline constexpr void assign (const Bounds<IndexType>& m, const Bounds<IndexType>& n, const SizeType ld, const T* const source, T* const target) {
			for (IndexType i = m.lower_; i != m.upper_; ++i) {
				for (IndexType j = n.lower_; j != n.upper_; ++j) {
					target[indx(m,n,i,j,ld)] = source[OrderSource<SizeType,IndexType,TranspositionSource>::indx(m,n,i,j,ld)];
				}
			}
		}
	};

	template <typename SizeType, typename IndexType, typename Transposition>
	requires std::integral<SizeType> && std::integral<IndexType> && CheckTransposed<Transposition> && (! std::same_as<SizeType, bool>) && (! std::same_as<IndexType, bool>)
	class RowMajorOrder <SizeType, IndexType, Transposition> {
	public:
		static inline constexpr SizeType indx (const Bounds<IndexType>& m, const Bounds<IndexType>& n, const IndexType i, const IndexType j, const SizeType ld) noexcept {
				return (ld * (j - n.lower_) + (i - m.lower_));
		}
		static inline constexpr SizeType size (const Bounds<IndexType>& m [[maybe_unused]], const Bounds<IndexType>& n, const SizeType ld) noexcept {
			return ld * (n.upper_ - n.lower);
		}
		static inline constexpr SizeType calc_ldim (const Bounds<IndexType>& m [[maybe_unused]], const Bounds<IndexType>& n) noexcept {
			return (n.upper_ - n.lower_);
		}
		template <typename T, typename TranspositionSource, template<typename,typename,typename> class OrderSource>
		static inline constexpr void assign (const Bounds<IndexType>& m, const Bounds<IndexType>& n, const SizeType ld, const T* const source, T* const target) {
			for (IndexType i = m.lower_; i != m.upper_; ++i) {
				for (IndexType j = n.lower_; j != n.upper_; ++j) {
					target[indx(m,n,i,j,ld)] = source[OrderSource<SizeType,IndexType,TranspositionSource>::indx(m,n,i,j,ld)];
				}
			}
		}
	};


	template<typename SizeType, typename IndexType>
	class BoundsCheck;

	template <typename SizeType, typename IndexType>
	requires std::integral<SizeType> && std::signed_integral<IndexType> && (! std::same_as<SizeType, bool>)
	class BoundsCheck<SizeType, IndexType> {
	public:
		static inline constexpr void indx (const Bounds<IndexType>& m, const Bounds<IndexType>& n, const IndexType i, const IndexType j) {
			if ((i >= m.upper_) || (i < m.lower_) || (j >= n.upper_) || (j < n.lower_) || (m.upper_ == m.lower_) || (n.upper_ == n.lower_)) {
				std::cerr << "\n(ml:mu),(nl:nu);(i,j) (" << m.lower_ <<":" << m.upper_ << "),(" << n.lower_ << ":" << n.upper_ << ");(" << i << "," << j << ")" << std::endl;
				throw std::runtime_error ("Out of bounds");
			}
		}
		static inline constexpr void calc_ldim (const SizeType ld, const SizeType calculated_ld) {
			if (ld < calculated_ld) throw std::runtime_error("Leading Dimension must be greater or equal than the corresponding matrix dimension");
		}
		static inline constexpr void mult (const Bounds<IndexType>& m, const Bounds<IndexType>& n) {
			if ((m.upper_-m.lower_) != (n.upper_-n.lower_)) throw std::runtime_error ("matmul dimension mismatch");
		}
		static inline constexpr void sgoe (const SizeType m, const SizeType n) {
			if (m < n) throw std::runtime_error ("size of target matrix ist too small");
		}
		static inline constexpr void check_size (const Bounds<IndexType>& m1, const Bounds<IndexType>& n1,
		  const Bounds<IndexType>& m2, const Bounds<IndexType>& n2)
		{
			if ((m1.upper_ != m2.upper_) || (m1.lower_ != m2.lower_) || (n1.upper_ != n2.upper_) || (n1.lower_ != n2.lower_))
				throw std::runtime_error ("sizes mismatch");
		}
	};

	template <typename SizeType, typename IndexType>
	requires std::integral<SizeType> && std::unsigned_integral<IndexType> && (! std::same_as<SizeType, bool>) && (! std::same_as<IndexType, bool>)
	class BoundsCheck<SizeType, IndexType> {
	public:
		static inline constexpr void indx (const Bounds<IndexType>& m, const Bounds<IndexType>& n, const IndexType i, const IndexType j) {
			if ((m.upper_ == 0) || (n.upper_ == 0) || (i > m.upper_) || (j > n.upper_)) {
				std::cerr << "m,n,i,j" << i << "," << j << std::endl;
				throw std::runtime_error("Out of bounds");
			}
		}
		static inline constexpr void calc_ldim (const SizeType ld, const SizeType calculated_ld) {
			if (ld < calculated_ld) throw std::runtime_error("Leading Dimension must be greater or equal than the corresponding matrix dimension");
		}
		static inline constexpr void mult (const Bounds<IndexType>& m, const Bounds<IndexType>& n) {
			if (m.u != n.u) throw std::runtime_error ("matmul dimension mismatch");
		}
		static inline constexpr void sgoe (const SizeType m, const SizeType n) {
			if (m < n) throw std::runtime_error ("size of target matrix ist too small");
		}
		static inline constexpr void check_size (const Bounds<IndexType>& m1, const Bounds<IndexType>& n1,
		  const Bounds<IndexType>& m2, const Bounds<IndexType>& n2)
		{
			if (m1.u != m2.u || n1.u != n2.u) throw std::runtime_error ("sizes mismatch");
		}
	};

	template <typename SizeType, typename IndexType>
	requires std::integral<SizeType> && std::integral<IndexType> && (! std::same_as<SizeType, bool>) && (! std::same_as<IndexType, bool>)
	class BoundsCheckNo {
	public:
		static inline constexpr void indx (const Bounds<IndexType>& m [[maybe_unused]], const Bounds<IndexType>& n [[maybe_unused]],
		  const IndexType i [[maybe_unused]], const IndexType j [[maybe_unused]]) noexcept
		{}
		static inline constexpr void calc_ldim (const SizeType ld [[maybe_unused]], const SizeType calculated_ld [[maybe_unused]]) noexcept {}
		static inline constexpr void mult (const Bounds<IndexType>& m [[maybe_unused]], const Bounds<IndexType>& n [[maybe_unused]]) noexcept {}
		static inline constexpr void sgoe (const SizeType m [[maybe_unused]], const SizeType n [[maybe_unused]]) noexcept {}
		static inline constexpr void check_size (const Bounds<IndexType>& m1 [[maybe_unused]], const Bounds<IndexType>& n1 [[maybe_unused]],
		  const Bounds<IndexType>& m2 [[maybe_unused]], const Bounds<IndexType>& n2 [[maybe_unused]]) noexcept
		{}
	};

	template <typename SizeType, typename IndexType = SizeType>
	requires std::integral<SizeType> && std::integral<IndexType> && (! std::same_as<SizeType, bool>) && (! std::same_as<IndexType, bool>)
	class MatrixDimension {
	public:
		Bounds<IndexType> m_, n_;
		SizeType ld_, size_, capa_;

		MatrixDimension () = default;
		MatrixDimension (const Bounds<IndexType> m, const Bounds<IndexType> n, const SizeType ld, const SizeType size, const SizeType capa)
		  : m_(m), n_(n), ld_(ld), size_(size), capa_(capa)
		{}
		consteval MatrixDimension (
			  const std::initializer_list<const IndexType>& mlist,
			  const std::initializer_list<const IndexType>& nlist,
			  const SizeType ld, const SizeType size, const SizeType capa
			) : m_({}), n_({}), ld_(ld), size_(size), capa_(capa)
		{
			auto ms = mlist.size();
			auto ns = nlist.size();

			if (0 == ms && 0 == ns) {
				m_ = Bounds<IndexType> (0, 0);
				n_ = Bounds<IndexType> (0, 0);
			} else {
				if (2 != ms || 2 != ns) throw std::logic_error ("initializer_list for m and n must contain each two values");

				auto it = mlist.begin();
				IndexType m0 = *it;
				IndexType m1 = *(++it);

				it = nlist.begin();
				IndexType n0 = *it;
				IndexType n1 = *(++it);

				m_ = Bounds<IndexType>(m0, m1);
				n_ = Bounds<IndexType>(n0, n1);
			}
		}

		constexpr void null () noexcept {
			m_.null();
			n_.null();

			ld_   = 0;
			size_ = 0;
			capa_ = 0;
		}

		template <typename ST, typename IT>
		friend void swap(MatrixDimension<ST,IT>&, MatrixDimension<ST,IT>&) noexcept;
	};

	template <typename SizeType, typename IndexType>
	void swap (MatrixDimension<SizeType, IndexType>& lhs, MatrixDimension<SizeType, IndexType>& rhs) noexcept {
		swap (lhs.m_,    rhs.m_);
		swap (lhs.n_,    rhs.n_);
		swap (lhs.ld_,   rhs.ld_);
		swap (lhs.size_, rhs.size_);
		swap (lhs.capa_, rhs.capa_);
	}

	/// YAMC yet another matrix class. The motivation behind this class is the problem that most matrix classes for C++
	/// does not use high efficient implementation of basic linear algebra algorithm such as BLAS or LAPACK.
	/// The interoperation with such HPC libraries requires a Fortran compatible layout of the matrix. Another
	/// problem with tradtional C++ matrix classes is, they are not designed to work with NUMA memory layouts.
	/// This class was designed as a test class for NUMA aware allocator. Currently this class only provides memory allocation
	/// and a operator() to access the stored scalars. Further it uses some helper classes, which provides C++ and Fortran
	/// storage layout, and a simple bounds checking class.

	template<
	  class Transposition = TransposedNo, typename T = double, typename IndexType = size_t,
	  template<typename = T> typename Allocator = std::allocator,
	  template<typename = typename std::allocator_traits<Allocator<T>>::size_type, typename = IndexType, typename = Transposition> typename Order = RowMajorOrder, 
	  template<typename = typename std::allocator_traits<Allocator<T>>::size_type, typename = IndexType> class BoundsChecker = BoundsCheckNo
	>
	requires Field<T> && std::integral<IndexType> && (! std::same_as<IndexType, bool>) && CheckTransposition<Transposition>
	class Matrix {
	public:
		using value_type      = T;
		using allocator_type  = Allocator<T>;
		using alloc_traits	  = std::allocator_traits<allocator_type>;

		using pointer         = typename alloc_traits::pointer;
		using const_pointer   = typename alloc_traits::const_pointer;
		using size_type       = typename alloc_traits::size_type;
		using difference_type = typename alloc_traits::difference_type;

		using reference       = value_type&;
		using const_reference = const value_type&;
		using OrderT = Order<size_type, IndexType, Transposition>;
	private:
		allocator_type allocator_;
		MatrixDimension<size_type, IndexType> mv_;
		pointer p_;

		void null() {
			this->destroy_elements();
			allocator_.deallocate(this->p_, this->mv_.capa_);
			this->mv_.null();
		}
		template<typename RIndexType, class RTransposition, template<typename> class RAllocator, template<typename, typename, typename> typename ROrder, template<typename, typename> class RBoundsChecker>
		void assign (const Matrix<RTransposition,T,RIndexType,RAllocator,ROrder,RBoundsChecker>& rhs) {
			BoundsChecker<size_type, IndexType>::sgoe(this->mv_.size_, rhs.mv_.size_);

			Order<size_type, IndexType, Transposition>::template assign<T,RTransposition,ROrder>(mv_.m_,mv_.n_,mv_.ld_,rhs.p_,this->p_);
		}
		void destroy_elements () {
			for (size_type i = 0; i < this->mv_.capa_; ++i) {
				alloc_traits::destroy(this->allocator_, &(this->p_[i]));
			}
		}
		void construct_elements () {
			for (size_type i = 0; i < this->mv_.capa_; ++i) {
				alloc_traits::construct(this->allocator_, &(this->p_[i]));
			}
		}
	public:
		/// Destroy all elements via std::allocator_traits<Allocator>::destroy and
		/// deallocate memory with the allocator Allocator.
		~Matrix() {
			if (p_) this->null();
		}

		allocator_type get_allocator() const {
			return allocator_;
		}

		bool empty() const {
			if (!p_ || 0 == mv_.size_) return true;

			return false;
		}

		// Constructors

		/// Default constructor needed because of container requirements.
		/// An empty matrix is more or less useless.
		Matrix () : allocator_({}), mv_(Bounds<IndexType>(0, 0), Bounds<IndexType>(0, 0), 0, 0, 0), p_(nullptr) {}
		explicit Matrix (allocator_type a) : allocator_(std::forward<allocator_type>(a)), mv_(Bounds<IndexType>(0, 0), Bounds<IndexType>(0, 0), 0, 0, 0), p_(nullptr) {}

		Matrix (const Bounds<IndexType> m, const Bounds<IndexType> n, const size_type ld, allocator_type a = {})
		  : allocator_(std::forward<allocator_type>(a)), mv_(m, n, ld, 0, 0), p_(nullptr)
		{
			mv_.size_ = OrderT::size(m, n, mv_.ld_);
			mv_.capa_ = mv_.size_;
			p_        = this->allocator_.allocate(this->mv_.capa_);
		}

		Matrix (const Bounds<IndexType> m, const Bounds<IndexType> n, allocator_type a = {}) : Matrix (m, n, OrderT::calc_ldim(m, n), a) {}
		Matrix (const IndexType mu, const IndexType nu, const size_type ld, allocator_type a = {}) : Matrix(Bounds<IndexType>(0, mu), Bounds<IndexType>(0, nu), ld, a) {}
		Matrix (const IndexType mu, const IndexType nu, allocator_type a = {}) : Matrix (mu, nu, OrderT::calc_ldim(mu, nu), a) {}

		Matrix (const Bounds<IndexType> m, const Bounds<IndexType> n, const std::initializer_list<const T>& list)
		  : allocator_({}), mv_(m, n, OrderT::calc_ldim(m,n), 0, 0), p_(nullptr)
		{
			std::cerr << "m = " << m << ", n = " << n << "\n";
			mv_.size_ = OrderT::size (m, n, mv_.ld_);
			mv_.capa_ = mv_.size_;

			if (mv_.size_ != list.size()) {
				std::cerr << "mv_.size_ = " << mv_.size_ << " list.size() = " << list.size() << std::endl;
				throw std::runtime_error ("initializer_list does not fit array size");
			}

			p_ = this->allocator_.allocate(this->mv_.capa_);

			size_type i = 0;
			for (auto elem: list) {
				p_[i++] = elem;
			} 
		}

		Matrix (const IndexType m, const IndexType n, const std::initializer_list<const T>& list)
		  : allocator_({}), mv_(m, n, OrderT::calc_ldim(m, n), 0, 0), p_(nullptr)
		{
			Matrix(Bounds<IndexType>(0, m), Bounds<IndexType>(0, n), list);
		}

		Matrix (const std::initializer_list<const std::initializer_list<const T>>& list) : allocator_ ({}), mv_(Bounds<IndexType>(0, 0), Bounds<IndexType>(0, 0), 0, 0, 0), p_(nullptr) {
			static_assert(
			  is_same_v<Order<size_type, IndexType, Transposition>, ColumnMajorOrder<size_type, IndexType, Transposition>>
			  || is_same_v<Order<size_type, IndexType, Transposition>, RowMajorOrder<size_type, IndexType, Transposition>>
			);

			if constexpr (is_same_v<Order<size_type, IndexType, Transposition>, ColumnMajorOrder<size_type, IndexType, Transposition>>) {
				// fortran
				auto columns = list.size();
				typename std::initializer_list<std::initializer_list<T>>::size_type rows = 0;

				for (auto column: list) {
					rows = max (rows, column.size());
				}

				mv_ = {Bounds<IndexType>(0, rows), Bounds<IndexType>(0, columns), columns, rows*columns, rows*columns};
			} else if constexpr (is_same_v<Order<size_type, IndexType, Transposition>, RowMajorOrder<size_type, IndexType, Transposition>>) {
				// c++ 
				auto rows = list.size();
				typename std::initializer_list<std::initializer_list<T>>::size_type columns = 0;

				for (auto row: list) {
					columns = max (columns, row.size());
				}

				mv_ = {Bounds<IndexType>(0, rows), Bounds<IndexType>(0, columns), rows, rows*columns, rows*columns};
			}

			p_ = this->allocator_.allocate(this->mv_.capa_);

			if constexpr (is_same_v<Order<size_type, IndexType, Transposition>, ColumnMajorOrder<size_type, IndexType, Transposition>>) {
				// fortran
				IndexType m = 0, n = 0;
				for (auto column: list) {
					m = 0;
					for (auto elem: column) {
						(*this)(m,n) = elem;
						++m;
					}
					++n;
				}
			} else if constexpr (is_same_v<Order<size_type, IndexType, Transposition>, RowMajorOrder<size_type, IndexType, Transposition>>) {
				// C++
				IndexType m = 0, n = 0;
				for (auto row : list) {
					n = 0;
					for (auto elem : row) {
						(*this)(m,n) = elem;
						++n;
					}
					++m;
				}
			}
		}

		Matrix(const std::initializer_list<IndexType>& lbl, const std::initializer_list<IndexType>& ubl, const std::initializer_list<T>& ILT) {
			Bounds<IndexType> lblist (lbl);
			Bounds<IndexType> ublist (ubl);
			Matrix (lblist, ublist, ILT);
		}

		// CopyConstructors
		Matrix (const Matrix& rhs, const allocator_type allocator) : allocator_(allocator), mv_(rhs.mv_), p_(this->allocator_.allocate(this->mv_.capa_)) {
			this->assign<IndexType,Transposition,Allocator,Order,BoundsChecker>(rhs);
		}
		Matrix (const Matrix& rhs) : Matrix (rhs, alloc_traits::select_on_container_copy_construction(rhs.allocator_))
		{}

		// MoveConstructors
		Matrix (Matrix&& rhs, const allocator_type& allocator)
		requires (alloc_traits::is_always_equal::value)
		  : allocator_(allocator), mv_(std::move(rhs.mv_)), p_(std::move(rhs.p_))
		{
			rhs.mv_.null();
			rhs.p_ = nullptr;
		}

		Matrix (Matrix&& rhs, const allocator_type& allocator)
		requires (std::is_empty_v<typename alloc_traits::is_always_equal>)
		  : allocator_(allocator), mv_(std::move(rhs.mv_), p_(nullptr))
		{
			if (rhs.allocator_ != allocator) {
				p_ = this->allocator_.allocate(this->mv_.capa_);
				this->assign(rhs);
			} else {
				p_ = std::move (rhs.p_);
			}
			rhs.mv_.null();
			rhs.p_ = nullptr;
		}

		Matrix (Matrix&& rhs) : Matrix (rhs, rhs.allocator_) {}

		// CopyAssignment
		Matrix& operator= (const Matrix& rhs)
		requires (alloc_traits::is_always_equal::value)
		{
			if (this == &rhs) return *this;

			if (this->mv_.capa < rhs.mv_.size) {
				this->null();
				this->mv_       = rhs.mv_;
				this->mv_.capa_ = rhs.mv_size;
				this->p_        = this->allocator_.allocate(this->mv_.capa_);
			} else {
				this->mv_       = rhs.mv_;
			}

			this->assign(rhs);

			return *this;
		}

		Matrix& operator= (const Matrix& rhs) {
			if constexpr (alloc_traits::propagate_on_container_copy_assignment::value) {
				if (this == &rhs) return *this;

				if ((this->allocator_ != rhs.allocator_) || (this->mv_.capa_ < rhs.mv_.size_)) {
					// first deallocate with old allocator
					this->null();
					// then allocate with new allocator
					this->allocator_ = rhs.allocator_;
					this->mv_        = rhs.mv_;
					this->mv_.capa_  = rhs.mv_.size_;
					this->p_         = this->allocator_.allocate(this->mv_.capa_);
				} else {
					this->mv_        = rhs.mv_;
				}

				this->assign(rhs);

				return *this;
			} else {
				if (this == &rhs) return *this;

				if (this->mv_.capa_ < rhs.mv_.size_) {
					std::cout << "Mat::op= C\n";
					// realloc required
					this->null();
					this->mv_       = rhs.mv_;
					this->mv_.capa_ = rhs.mv_.size_;
					this->p_        = this->allocator_.allocate(this->mv_.capa_);
				} else {
					std::cout << "Mat::op= D\n";
					this->mv_       = rhs.mv_;
				}

				this->assign(rhs);

				return *this;
			}
		}

		// CopyAssignment for same type but different other template parameters
		template<typename RTransposition, typename RIndexType, template<typename> class RAllocator, template<typename,typename,typename> class ROrder, template<typename,typename> class RBoundsChecker>
		Matrix& operator= (Matrix<RTransposition,T,RIndexType,RAllocator,ROrder,RBoundsChecker>& rhs) {
			if (this->mv_.capa_ < rhs.mv_.size_) {
				this->null();
				this->mv_       = rhs.mv_;
				this->mv_.capa_ = rhs.mv_.size_;
				this->p_        = this->allocator_.allocate(this->mv_.capa_);
			} else {
				this->mv_       = rhs.mv_;
			}

			this->assign(rhs);

			return *this;
		}

		// MoveAssignment
		template<typename RTransposition, typename RIndexType, template <typename> class RAllocator, template<typename,typename,typename> class ROrder, template <typename,typename> class RBoundsChecker>
		Matrix& operator= (Matrix<RTransposition,T,RIndexType,RAllocator,ROrder,RBoundsChecker>&& rhs) {
			if (this == &rhs) return *this;

			if constexpr (alloc_traits::propagate_on_container_move_assignment::value) {
				this->null();
				this->allocator_ = rhs.allocator_;
				this->mv_        = std::move (rhs.mv_);
				this->p_         = std::move (rhs.p_);
			} else if (this->mv_.capa_ < rhs.mv_.size_) {
				this->null();
				this->mv_        = std::move(rhs.mv_);
				this->mv_.capa_  = this->mv_.size_;
				this->p_         = this->allocator_.allocate(this->mv_.capa_);
				this->assign(rhs);
			} else {
				size_type capa   = this->mv_.capa_;
				this->mv_        = std::move (rhs.mv_);
				this->mv_.capa_  = capa;
				this->assign(rhs);
			}

			rhs.mv_.null();
			rhs.p_ = nullptr;

			return *this;
		}

		/*
		allocator_type get_allocator() const {
			return this->allocator_;
		}
		*/

		reference operator() (const IndexType i, const IndexType j) {
			BoundsChecker<size_type, IndexType>::indx(mv_.m_, mv_.n_, i, j);
			return(p_)[OrderT::indx(mv_.m_, mv_.n_, i, j, mv_.ld_)];
		}

		const_reference operator() (const IndexType i, const IndexType j) const {
			BoundsChecker<size_type,IndexType>::index(mv_.m_, mv_.n_, i, j);
			return (p_)[OrderT::indx(mv_.m_, mv_.n_, i, j, mv_.ld_)];
		}

		Matrix&
		operator+= (const Matrix& rhs) {
			BoundsChecker<size_type,IndexType>::check_size (this->mv_.m_, this->mv_.n_, rhs.mv_.m_, rhs.mv_.n_);

			for (typename Matrix::size_type i = 0; i < this->mv_.size_; ++i) {
				this->p_[i] += rhs.p_[i];
			}

			return *this;
		}

		Matrix&
		operator-= (const Matrix& rhs) {
			BoundsChecker<size_type,IndexType>::check_size (this->mv_.m_, this->mv_.n_, rhs.mv_.m_, rhs.mv_.n_);

			for (typename Matrix::size_type i = 0; i < this->mv_.size_; ++i) {
				this->p_[i] -= rhs.p_[i];
			}

			return *this;
		}

		Matrix&
		operator*= (const Matrix& rhs) {
			BoundsChecker<size_type,IndexType>::check_size (this->mv_.m_, this->mv_.n_, rhs.mv_.m_, rhs.mv_.n_);

			for (typename Matrix::size_type i = 0; i < this->mv_.size_; ++i) {
				this->p_[i] *= rhs.p_[i];
			}

			return *this;		
		}

		Matrix&
		operator/= (const Matrix& rhs) {
			BoundsChecker<size_type,IndexType>::check_size (this->mv_.m_, this->mv_.n_, rhs.mv_.m_, rhs.mv_.n_);

			for (typename Matrix::size_type i = 0; i < this->mv_.size_; ++i) {
				this->p_[i] /= rhs.p_[i];
			}

			return *this;
		}

		template<typename T1, typename T2, typename T3, template<typename> typename T4, template<typename,typename,typename> typename T5, template<typename,typename> typename T6>
		requires Field<T2> && std::integral<T3> && (! std::same_as<T3, bool>) && CheckTransposition<T1>
		friend class Matrix;

		template<typename T1, typename T2, typename T3, template<typename> typename T4, template<typename,typename,typename> typename T5, template<typename,typename> typename T6>
		requires Field<T2> && std::integral<T3> && (! std::same_as<T3, bool>) && CheckTransposition<T1>
		friend void swap (Matrix<T1,T2,T3,T4,T5,T6>&, Matrix<T1,T2,T3,T4,T5,T6>&);

		template<typename T1, typename T2, typename T3, template<typename> typename T4, template<typename,typename,typename> typename T5, template <typename,typename> typename T6>
		requires Field<T2> && std::integral<T3> && (! std::same_as<T3, bool>) && CheckTransposition<T1>
		friend Matrix<T1,T2,T3,T4,T5,T6>
		operator+ (const Matrix<T1,T2,T3,T4,T5,T6>&, const Matrix<T1,T2,T3,T4,T5,T6>&);

		template<typename T1, typename T2, typename T3, template<typename> typename T4, template<typename,typename,typename> typename T5, template <typename,typename> typename T6>
		requires Field<T2> && std::integral<T3> && (! std::same_as<T3, bool>) && CheckTransposition<T1>
		friend Matrix<T1,T2,T3,T4,T5,T6>
		operator- (const Matrix<T1,T2,T3,T4,T5,T6>&, const Matrix<T1,T2,T3,T4,T5,T6>&);

		template<typename T1, typename T2, typename T3, template<typename> typename T4, template<typename,typename,typename> class T5, template <typename,typename> class T6>
		requires Field<T2> && std::integral<T3> && (! std::same_as<T3, bool>) && CheckTransposition<T1>
		friend Matrix<T1,T2,T3,T4,T5,T6>
		operator* (const Matrix<T1,T2,T3,T4,T5,T6>& lhs, const Matrix<T1,T2,T3,T4,T5,T6>& rhs);

		template<typename T1, typename T2, typename T3, template<typename> typename T4, template<typename,typename,typename> typename T5, template <typename,typename> typename T6>
		requires Field<T2> && std::integral<T3> && (! std::same_as<T3, bool>) && CheckTransposition<T1>
		friend Matrix<T1,T2,T3,T4,T5,T6>
		operator/ (const Matrix<T1,T2,T3,T4,T5,T6>&, const Matrix<T1,T2,T3,T4,T5,T6>&);

		std::ostream& print (std::ostream& out) {
			out << "Matrix Dimension m: " << mv_.m_ << " n: " << mv_.n_ << "\n";
			for (IndexType i = mv_.m_.lower_; i < mv_.m_.upper_; ++i) {
				out << "[";
				for (IndexType j = mv_.n_.lower_; j < mv_.n_.upper_; ++j) {
					out << (*this)(i,j) << " ";
				}
				out << "]\n";
			}

			std::cout << "\n";

			size_type ld = mv_.ld_;
			size_type lines = mv_.size_ / ld;

			std::cout << "ld=" << ld << std::endl;

			for (size_type i = 0; i < lines; ++i) {
				out << "[";
				for (size_type j = 0; j < ld; ++j) {
					out << this->p_[(i*mv_.ld_)+j] << " ";
				}
				out << "]\n";
			}			

			std::cout << std::flush;

			return out;
		}
	};

	// deduction guides
	template<
	  class Transposition = TransposedNo, typename T = double, typename IndexType = size_t,
	  template<typename = T> typename Allocator = std::allocator,
	  template<typename = typename std::allocator_traits<Allocator<T>>::size_type, typename = IndexType, typename = Transposition> typename Order = RowMajorOrder, 
	  template<typename = typename std::allocator_traits<Allocator<T>>::size_type, typename = IndexType> class BoundsChecker = BoundsCheckNo
	>
	Matrix (std::initializer_list<T>) -> Matrix<Transposition,T,IndexType,Allocator,Order,BoundsChecker>;

	template <typename Transposition, typename T, typename IndexType, template<typename> typename Allocator, template<typename, typename, typename> typename Order, template<typename,typename> typename BoundsChecker>
	std::ostream& operator<< (std::ostream& out, Matrix<Transposition,T,IndexType,Allocator,Order,BoundsChecker>& M) {
		return M.print(out);
	}

	template <
	  typename Transposition, typename T, typename IndexType,
	  template<typename> typename Allocator,
	  template<typename, typename, typename> typename Order,
	  template<typename,typename> typename BoundsChecker
	>
	requires Field<T> && std::integral<IndexType> && (! std::same_as<IndexType, bool>) && CheckTransposition<Transposition>
	void swap (Matrix<Transposition, T, IndexType, Allocator, Order, BoundsChecker>& lhs,
		Matrix<Transposition, T, IndexType, Allocator, Order, BoundsChecker>& rhs)
	{
		if (std::allocator_traits<Allocator<T>>::propagate_on_container_swap::value) {
			std::cout << "swap POCS\n" << std::endl;
			swap (lhs.allocator_, rhs.allocator_);
			swap (lhs.mv_, rhs.mv_);
			swap (lhs.p_,  rhs.p_);
		} else {
			std::cout << "swap non POCS" << std::endl;
			using MatrixT = Matrix<Transposition, T, IndexType, Allocator, Order, BoundsChecker>;

			if (lhs.mv_.capa_ >= rhs.mv_.size_ || lhs.mv_.size_ <= rhs.mv_.capa_) {
				// no reallocation needed
				auto ms = max(lhs.mv_.size_, rhs.mv_.size_);
				MatrixT tma (lhs.allocator_);

				tma.mv_ = lhs.mv_;
				lhs.mv_ = rhs.mv_;
				lhs.mv_.capa_ = tma.mv_.capa_;

				auto rc = rhs.mv_.capa_;
				rhs.mv_ = tma.mv_;
				rhs.mv_.capa_ = rc;

				// no swap, we have to copy the elements
				for (typename MatrixT::size_type i = 0; i != ms; ++i) {
					T t = lhs.p_[i];
					lhs.p_[i] = rhs.p_[i];
					rhs.p_[i] = t;
				}
			} else if (lhs.mv_.capa_ < rhs.mv_.size_) {
				Matrix tma  = move(lhs);
				lhs = rhs;
				rhs = tma;
			} else if (lhs.mv_.size_ > rhs.mv_.capa_) {
				Matrix tma = move(rhs);
				rhs = lhs;
				rhs = tma;
			} else {
				// should not happen
				throw std::logic_error ("case should never happen");
			}
		}
	}


	template <typename Transposition, typename T, typename IndexType, template<typename>typename Allocator, template<typename, typename, typename> typename Order, template <typename,typename> typename BoundsChecker>
	requires Field<T> && std::integral<IndexType> && (! std::same_as<IndexType, bool>) && CheckTransposition<Transposition>
	Matrix<Transposition,T,IndexType,Allocator,Order,BoundsChecker>
	operator+ (const Matrix<Transposition,T,IndexType,Allocator,Order,BoundsChecker>& lhs, const Matrix<Transposition,T,IndexType,Allocator,Order,BoundsChecker>& rhs)
	{
		using MatrixT = Matrix<Transposition, T, IndexType, Allocator, Order, BoundsChecker>;
		BoundsChecker<typename std::allocator_traits<Allocator<T>>::size_type,IndexType>::check_size (lhs.mv_.m_, lhs.mv_.n_, rhs.mv_.m_, rhs.mv_.n_);

		MatrixT r (lhs);
		for (typename MatrixT::size_type i = 0; i < lhs.mv_.size_; ++i) {
			r.p_[i] += rhs.p_[i];
		}

		return r;
	}

	template <typename Transposition, typename T, typename IndexType, template<typename> typename Allocator, template<typename, typename, typename> typename Order, template <typename,typename> typename BoundsChecker>
	requires Field<T> && std::integral<IndexType> && (! std::same_as<IndexType, bool>) && CheckTransposition<Transposition>
	Matrix<Transposition,T,IndexType,Allocator,Order,BoundsChecker>
	operator- (const Matrix<Transposition,T,IndexType,Allocator,Order,BoundsChecker>& lhs, const Matrix<Transposition,T,IndexType,Allocator,Order,BoundsChecker>& rhs)
	{
		using MatrixT = Matrix<Transposition, T, IndexType, Allocator, Order, BoundsChecker>;
		BoundsChecker<typename std::allocator_traits<Allocator<T>>::size_type,IndexType>::check_size (lhs.mv_.m_, lhs.mv_.n_, rhs.mv_.m_, rhs.mv_.n_);

		MatrixT r (lhs);
		for (typename MatrixT::size_type i = 0; i < lhs.mv_.size_; ++i) {
			r.p_[i] -= rhs.p_[i];
		}

		return r;
	}

	template <typename Transposition, typename T, typename IndexType, template<typename> typename Allocator, template<typename, typename, typename> typename Order, template <typename,typename> typename BoundsChecker>
	requires Field<T> && std::integral<IndexType> && (! std::same_as<IndexType, bool>) && CheckTransposition<Transposition>
	Matrix<Transposition,T,IndexType,Allocator,Order,BoundsChecker>
	operator* (const Matrix<Transposition,T,IndexType,Allocator,Order,BoundsChecker>& lhs, const Matrix<Transposition,T,IndexType,Allocator,Order,BoundsChecker>& rhs)
	{
		using MatrixT = Matrix<Transposition, T, IndexType, Allocator, Order, BoundsChecker>;
		BoundsChecker<typename std::allocator_traits<Allocator<T>>::size_type,IndexType>::check_size (lhs.mv_.m_, lhs.mv_.n_, rhs.mv_.m_, rhs.mv_.n_);

		MatrixT r (lhs);
		for (typename MatrixT::size_type i = 0; i < lhs.mv_.size_; ++i) {
			r.p_[i] *= rhs.p_[i];
		}

		return r;
	}

	template <typename Transposition, typename T, typename IndexType, template<typename> typename Allocator, template<typename, typename, typename> typename Order, template <typename,typename> typename BoundsChecker>
	requires Field<T> && std::integral<IndexType> && (! std::same_as<IndexType, bool>) && CheckTransposition<Transposition>
	Matrix<Transposition,T,IndexType,Allocator,Order,BoundsChecker>
	operator/ (const Matrix<Transposition,T,IndexType,Allocator,Order,BoundsChecker>& lhs, const Matrix<Transposition,T,IndexType,Allocator,Order,BoundsChecker>& rhs)
	{
		using MatrixT = Matrix<Transposition, T, IndexType, Allocator, Order, BoundsChecker>;
		BoundsChecker<typename std::allocator_traits<Allocator<T>>::size_type,IndexType>::check_size (lhs.mv_.m_, lhs.mv_.n_, rhs.mv_.m_, rhs.mv_.n_);

		MatrixT r (lhs);
		for (typename MatrixT::size_type i = 0; i < lhs.mv_.size_; ++i) {
			r.p_[i] /= rhs.p_[i];
		}

		return r;
	}

	template <typename Transposition, typename T, typename IndexType, template<typename> typename Allocator, template<typename,typename,typename> typename Order, template <typename,typename> typename BoundsChecker>
	void GEMM (const Matrix <Transposition,T,IndexType,Allocator,Order,BoundsChecker>& A,
		const Matrix <Transposition,T,IndexType,Allocator,Order,BoundsChecker>& B,
		Matrix<Transposition,T,IndexType,Allocator,Order,BoundsChecker>& C,
		const T alpha, const T beta
	)
	{
		// dummy code
		A = B+C;
		auto r = alpha * beta;

	}

	// GEPP
	// GEMP
	// GEPM
	// GEBP
	// GEPB
	// GEPDOT
	// GEBB

} // end namespace
#endif // MATRIX_H_

Finnegan

@john-0 Danke. Da sieht man mal, dass es immer wieder eine Menge Holz ist, wenn man das "ordentlich" machen will

Ich habs nur überflogen, aber sind die Anordnung der Elemente (Row/Colum Major) und transponiert/nicht transponiert separate Eigenschaften der Matrix? Ich frage mich gerade, ob sich da nicht eventuell Code sparen könnte, wenn man beide unter der Haube als äquivalent betrachtet und diese Eigenschaften nur im Interface exponiert - also z.B. "eine Row Major Matrix ist dasselbe wie eine transponierte Column Major Matrix". Oder kann man das nicht so einfach sagen in diesem (NUMA-aware) Kontext?

*john 0

Im Prinzip könnte man das so machen, wenn da das Thema Leading Dimension nicht wäre. Die Leading Dimension, wird dann interessant, wenn man anfängt ein Submatrix zu betrachten, da man diese mit Hilfe der Leading Dimension nicht kopieren muss. Nimmt man als Beispiel an, dass die eigentliche Matrix 1000×1000 groß ist, und man will 100×100 Matrizen bearbeiten, kann man Teilmatrizen ohne Kopieren bearbeiten, wenn man den Pointer auf das korrekte Element der Submtraix setzt und die Leading Dimension setzt.

Bisher fehlt das halt im Code komplett.

Finnegan

@john-0 sagte in Matrix Template Klasse als Beispiel für die Verwendung von Allokatoren:

Im Prinzip könnte man das so machen, wenn da das Thema Leading Dimension nicht wäre. Die Leading Dimension, wird dann interessant, wenn man anfängt ein Submatrix zu betrachten, da man diese mit Hilfe der Leading Dimension nicht kopieren muss. Nimmt man als Beispiel an, dass die eigentliche Matrix 1000×1000 groß ist, und man will 100×100 Matrizen bearbeiten, kann man Teilmatrizen ohne Kopieren bearbeiten, wenn man den Pointer auf das korrekte Element der Submtraix setzt und die Leading Dimension setzt.

Bisher fehlt das halt im Code komplett.

Ein paar Gedanken dazu:

Das mit dem "nicht kopieren" ist auch eine Motivation, eine transponierte Matrix einfach als eine der anderen Elementorndung zu betrachten. Transponieren ist dann nämlich ebenfalls erstmal eine No-Op, die Element-Daten werden einfach in eine Matrix der jeweils anderen Ordnung gemoved, ohne sie (obendrein auch noch cache-ungünstig) kopieren zu müssen.

Dazu braucht man allerdings auch Matrix-Rechenoperationen, die mit gemischten Odnungen umgehen können. Das eigentliche "Arbeit" des Transponierens würde dann z.B. in einer folgenden Matrix-Addition stattfinden, die bei gemischten Ordnungen eine der Matrizen anders durchlaufen müsste. Bei der Matrix-Multiplikation könnten entgegengesetzte Ordnungen sogar förderlich sein, da man beide Matrizen in der Speicher-Reihenfolge durchlaufen könnte, anstatt eine Zeilenweise und die andere Spaltenweise, was - zumindest bei naiver Implementierung - nicht so günstig für den CPU-Cache ist.

Ich habe das mit den unterschiedlichen Ordnungen mal sehr generisch gelöst, indem ich den Matrizen row_stride und col_stride Integer-Attribute gegeben habe. Das sind die Anzahl der Elemente, die man überspringen muss, um in die jeweils nächste Zeile oder Spalte zu kommen. Für eine $m \times n$ Matrix ist bei Row-Major-Ordnung row_stride = $n$ und col_stride = 1 und bei Colum-Major-Anordung row_stride = 1 und col_stride = $m$ . Damit wird die Index-Berechnung zu col_stride * j + row_stride * i, egal in welcher Ordung die Matrix vorliegt - so kann man an einigen Stellen auch auf einen Branch zur Fallunterscheidung zwischen Row/Column Major verzichten. Auch eventuelles Padding, z.B. so dass Zeilen/Spalten im Speicher immer ein vielfaches der Breite von SIMD-Datentypen sind, lässt sich damit gut handhaben.

Submatrizen ähnlich zu std::span übernehmen dann row_stride und col_stride von der Matrix, auf der sie verweisen und speichern einen (Integer-) Index auf das erste Element der Submatrix. Auch wenn die Submatrix andere Dimensionen als die referenzierte Matrix hat, so bringen einen das row_stride der Elternmatrix immer noch in die nächste Zeile und col_stride in die nächste Spalte.

Natürlich ist das nur einer von vielen gangbaren Ansätzen das Problem anzugehen, ich fand den jedoch relativ hilfreich - vor allem um den Code zu reduzieren.

*john 0

@Finnegan sagte in Matrix Template Klasse als Beispiel für die Verwendung von Allokatoren:

Das mit dem "nicht kopieren" ist auch eine Motivation, eine transponierte Matrix einfach als eine der anderen Elementorndung zu betrachten. Transponieren ist dann nämlich ebenfalls erstmal eine No-Op, die Element-Daten werden einfach in eine Matrix der jeweils anderen Ordnung gemoved, ohne sie (obendrein auch noch cache-ungünstig) kopieren zu müssen.

Ehrlich gesagt hatte ich das Thema noch nicht vertiefend betrachtet, da man in der Realität das nicht so häufig braucht, und ganz andere Optimierungsstrategien beim HPC eine Rolle spielen. Die trivial Operationen wie +, -, und Hadamard-Multiplikation kann man zwar für diese Fälle optimieren. Allerdings sind das Operationen, die im großen ganzen nur vom Speicherdurchsatz abhängen, und man nutzt in der Regel die gleichen Matrizen. Das betrifft sowohl die Speicherorganisation wie auch die eigentlichen Variablen in Programmcode.

Dazu braucht man allerdings auch Matrix-Rechenoperationen, die mit gemischten Odnungen umgehen können. Das eigentliche "Arbeit" des Transponierens würde dann z.B. in einer folgenden Matrix-Addition stattfinden, die bei gemischten Ordnungen eine der Matrizen anders durchlaufen müsste. Bei der Matrix-Multiplikation könnten entgegengesetzte Ordnungen sogar förderlich sein, da man beide Matrizen in der Speicher-Reihenfolge durchlaufen könnte, anstatt eine Zeilenweise und die andere Spaltenweise, was - zumindest bei naiver Implementierung - nicht so günstig für den CPU-Cache ist.

Der triviale Ansatz ist in der Realität eher unwichtig, da der Geschwindigkeitsunterschied so groß ist, dass man mit Sicherheit in real existierendem Code eine der linearen Algebra Bibliotheken nutzen wird. Als Standard hat sich dabei die BLAS und LAPACK Bibliothek etabliert. Es gibt eine Implementation GotoBLAS, die auf den Autoren Kazushige Goto zurückgeht. Er hat einen Artikel in der ACM TOMS veröffentlicht, in dem er darstellt wie man eine simple Matrizenmultiplikation auf einem modernen Prozessor optimieren kann. TLDR neben der notwendigen drei Schleifen für den eigentlichen Algorithmus muss man je Cacheebene drei weitere Schleifen in den Code einbauen. Die Erfahrung zeigt allerdings, dass die für den L1 Cache der Compiler das selbst ganz gut hinbekommt, bzw. man besser von Hand SIMD-Code dafür schreibt. Aber die anderen Schleifen muss man umsetzen, und so die Anzahl der Cache Misses minimieren.

Daher war mein Gedankengang, dass ich explizit nicht versuche das so allgemein wie möglich zu halten, sondern die Eigenschaften Speicherordnung und Transposition als Typen kodierte, um dann die Möglichkeit zu haben über eine Template-Spezialisierung direkt eine BLAS- bzw. LAPACK-Implementation zu nutzen. Da kommt man dann bei einem Aufruf ohne großen Aufwand aus, und übergibt direkt den Zeiger auf den Speicher an die entsprechende Routine.

Ich habe das mit den unterschiedlichen Ordnungen mal sehr generisch gelöst, indem ich den Matrizen row_stride und col_stride Integer-Attribute gegeben habe. Das sind die Anzahl der Elemente, die man überspringen muss, um in die jeweils nächste Zeile oder Spalte zu kommen. Für eine $m \times n$ Matrix ist bei Row-Major-Ordnung row_stride = $n$ und col_stride = 1 und bei Colum-Major-Anordung row_stride = 1 und col_stride = $m$ . Damit wird die Index-Berechnung zu col_stride * j + row_stride * i, egal in welcher Ordung die Matrix vorliegt - so kann man an einigen Stellen auch auf einen Branch zur Fallunterscheidung zwischen Row/Column Major verzichten. Auch eventuelles Padding, z.B. so dass Zeilen/Spalten im Speicher immer ein vielfaches der Breite von SIMD-Datentypen sind, lässt sich damit gut handhaben.

Der Ansatz ist interessant, wobei für das SIMD-Speicherlayout bereits die Leading Dimension ausreichen ist. Ich werde das im Hinterkopf behalten und dann schauen was man daraus machen kann.

P.S. Danke fürs Feedback