Example Code (Cache Optimization)

A simple timing helper class (StopWatch.hpp)

#pragma once

#include <chrono>

/**
 * Minimal stopwatch for timing sections of code.
 *
 * The watch keeps two time points.  Each call to lap() moves the previous
 * stop point into the start slot and records a fresh stop point, so
 * consecutive calls measure consecutive intervals.
 */
class StopWatch
{

private:

    // Use a monotonic clock: interval measurements must not be disturbed by
    // wall-clock adjustments, which std::chrono::system_clock permits.
    using clock_type = std::chrono::steady_clock;
    using time_type = std::chrono::time_point<clock_type>;

public:

    /// A singleton.
    static StopWatch & me()
    {
        static StopWatch instance;
        return instance;
    }

    /// Start and stop both begin at the construction time.
    StopWatch() : m_start(clock_type::now()), m_stop(m_start) {}

    StopWatch(StopWatch const & ) = default;
    StopWatch(StopWatch       &&) = default;
    StopWatch & operator=(StopWatch const & ) = default;
    StopWatch & operator=(StopWatch       &&) = default;
    ~StopWatch() = default;

    /**
     * Record a new lap and return its length in seconds, i.e. the time
     * between the previous stop point (or construction) and now.
     */
    double lap()
    {
        m_start = m_stop;
        m_stop = clock_type::now();
        return std::chrono::duration<double>(m_stop - m_start).count();
    }

    /**
     * Return the length in seconds of the most recently recorded lap
     * (zero before the first call to lap()).
     */
    double duration() const { return std::chrono::duration<double>(m_stop - m_start).count(); }

    /**
     * Return the tick period of the underlying clock in seconds.
     */
    static constexpr double resolution()
    {
        return double(clock_type::period::num) / double(clock_type::period::den);
    }

private:

    time_type m_start;
    time_type m_stop;

}; /* end class StopWatch */

Example code for skipped access (01_skip_access.cpp)

#include "StopWatch.hpp"

#include <cstddef>
#include <iostream>

int main(int argc, char ** argv)
{
    constexpr const size_t nelem = 128 * 1024 * 1024;
    int * arr = new int[nelem];
    double elapsed;

    StopWatch sw;

    // Sequential; accessing all data every 4 bytes.
    for (size_t i=0; i<nelem; ++i) { arr[i] = i; }
    sw.lap();
    for (size_t i=0; i<nelem; ++i) { arr[i] *= 3; }
    elapsed = sw.lap();
    std::cout << "Sequential takes: " << elapsed << " sec" << std::endl;

    std::cout << std::endl;

    // Skipping 2; accessing 4 bytes every 8 bytes.
    for (size_t i=0; i<nelem; ++i) { arr[i] = i; }
    sw.lap();
    for (size_t i=0; i<nelem; i+=2) { arr[i] *= 3; }
    elapsed = sw.lap();
    std::cout << "Skipping 2 takes: " << elapsed << " sec" << std::endl;

    // Skipping 4; accessing 4 bytes every 16 bytes.
    for (size_t i=0; i<nelem; ++i) { arr[i] = i; }
    sw.lap();
    for (size_t i=0; i<nelem; i+=4) { arr[i] *= 3; }
    elapsed = sw.lap();
    std::cout << "Skipping 4 takes: " << elapsed << " sec" << std::endl;

    // Skipping 8; accessing 4 bytes every 32 bytes.
    for (size_t i=0; i<nelem; ++i) { arr[i] = i; }
    sw.lap();
    for (size_t i=0; i<nelem; i+=8) { arr[i] *= 3; }
    elapsed = sw.lap();
    std::cout << "Skipping 8 takes: " << elapsed << " sec" << std::endl;

    // Skipping 16; accessing 4 bytes every 64 bytes.
    for (size_t i=0; i<nelem; ++i) { arr[i] = i; }
    sw.lap();
    for (size_t i=0; i<nelem; i+=16) { arr[i] *= 3; }
    elapsed = sw.lap();
    std::cout << "Skipping 16 takes: " << elapsed << " sec" << std::endl;

    std::cout << std::endl;

    // Skipping 32; accessing 4 bytes every 128 bytes.
    for (size_t i=0; i<nelem; ++i) { arr[i] = i; }
    sw.lap();
    for (size_t i=0; i<nelem; i+=32) { arr[i] *= 3; }
    elapsed = sw.lap();
    std::cout << "Skipping 32 takes: " << elapsed << " sec" << std::endl;

    // Skipping 64; accessing 4 bytes every 256 bytes.
    for (size_t i=0; i<nelem; ++i) { arr[i] = i; }
    sw.lap();
    for (size_t i=0; i<nelem; i+=64) { arr[i] *= 3; }
    elapsed = sw.lap();
    std::cout << "Skipping 64 takes: " << elapsed << " sec" << std::endl;

    // Skipping 128; accessing 4 bytes every 512 bytes.
    for (size_t i=0; i<nelem; ++i) { arr[i] = i; }
    sw.lap();
    for (size_t i=0; i<nelem; i+=128) { arr[i] *= 3; }
    elapsed = sw.lap();
    std::cout << "Skipping 128 takes: " << elapsed << " sec" << std::endl;

    // Skipping 256; accessing 4 bytes every 1024 bytes.
    for (size_t i=0; i<nelem; ++i) { arr[i] = i; }
    sw.lap();
    for (size_t i=0; i<nelem; i+=256) { arr[i] *= 3; }
    elapsed = sw.lap();
    std::cout << "Skipping 256 takes: " << elapsed << " sec" << std::endl;

    // Skipping 512; accessing 4 bytes every 2048 bytes.
    for (size_t i=0; i<nelem; ++i) { arr[i] = i; }
    sw.lap();
    for (size_t i=0; i<nelem; i+=512) { arr[i] *= 3; }
    elapsed = sw.lap();
    std::cout << "Skipping 512 takes: " << elapsed << " sec" << std::endl;

    // Skipping 1024; accessing 4 bytes every 4096 bytes.
    for (size_t i=0; i<nelem; ++i) { arr[i] = i; }
    sw.lap();
    for (size_t i=0; i<nelem; i+=1024) { arr[i] *= 3; }
    elapsed = sw.lap();
    std::cout << "Skipping 1024 takes: " << elapsed << " sec" << std::endl;

    delete[] arr;
}

Example code for data layout (02_locality.cpp)

#include "StopWatch.hpp"

#include <cstddef>
#include <iostream>
#include <time.h>

/**
 * Compare the time needed to fill a fixed-size buffer when it is viewed as a
 * row-major 2D array and traversed along the last axis versus the first axis.
 * The number of columns doubles from 1 to 8192 while the total element count
 * stays constant.
 */
int main(int argc, char ** argv)
{
    constexpr const size_t nelem = 1024 * 1024 * 64;

    double * buffer = new double[nelem];
    StopWatch sw;
    double elapsed;

    // Assume row-major 2D array of shape (nrow, ncol) with nrow*ncol == nelem.
    for (size_t ncol=1; ncol<=1024 * 8; ncol *= 2)
    //for (size_t ncol=1024; ncol>0; ncol /= 2)
    {

        // Pre-populate to cancel the effect of overcommit or delayed allocation.
        for (size_t i=0; i<nelem; ++i) { buffer[i] = nelem-i; }

        const size_t nrow = nelem / ncol;
        std::cout << "# of elements: " << nelem << " = "
                  << nrow << " x " << ncol << std::endl;

        // Populate flatly.
        sw.lap();
        for (size_t i=0; i<nelem; ++i) { buffer[i] = i; }
        elapsed = sw.lap();
        std::cout << "populate double flatly takes: "
                  << elapsed << " sec" << std::endl;

        // Populate by last index: the inner loop walks contiguous elements.
        sw.lap();
        for (size_t i=0; i<nrow; ++i) // the i-th row
        {
            for (size_t j=0; j<ncol; ++j) // the j-th column
            {
                buffer[i*ncol + j] = i*ncol + j;
            }
        }
        const double elapsed_last = elapsed = sw.lap();
        std::cout << "populate double along last axis takes: "
                  << elapsed << " sec" << std::endl;

        // Populate by first index: the inner loop jumps ncol elements per step.
        sw.lap();
        for (size_t j=0; j<ncol; ++j) // the j-th column
        {
            for (size_t i=0; i<nrow; ++i) // the i-th row
            {
                buffer[i*ncol + j] = i*ncol + j;
            }
        }
        const double elapsed_first = elapsed = sw.lap();
        std::cout << "populate double along first axis takes: "
                  << elapsed << " sec" << std::endl;

        std::cout << "ratio: " << elapsed_first / elapsed_last
                  << std::endl << std::endl;

    }

    // NOTE(review): presumably a final read-modify-write so the stores above
    // count as used -- confirm the intent.
    for (size_t i=0; i<nelem; ++i) { buffer[i] += i; }
    delete[] buffer;

    return 0;
}

Example code for tiled matrix-matrix multiplication (03_matrix_matrix.cpp)

#include "StopWatch.hpp"

#ifdef HASMKL
#include <mkl.h>
#include <mkl_lapack.h>
#include <mkl_lapacke.h>
#else // HASMKL
#ifdef __MACH__
#include <clapack.h>
#include <Accelerate/Accelerate.h>
#endif // __MACH__
#endif // HASMKL

#include <iostream>
#include <sstream>
#include <iomanip>
#include <vector>
#include <stdexcept>
#include <functional>

/**
 * Dense row-major matrix of doubles that owns a heap buffer.
 *
 * Besides the element storage, each matrix carries two bookkeeping fields
 * (elapsed seconds and operation count) that the multiplication kernels fill
 * in so callers can report throughput.  All members are public because the
 * kernels in this file access m_buffer directly.
 */
struct Matrix {

public:

    /// Allocate an (nrow x ncol) matrix; element values are left uninitialized.
    Matrix(size_t nrow, size_t ncol)
    {
        reset_buffer(nrow, ncol);
    }

    /**
     * Allocate an (nrow x ncol) matrix and fill it row by row from vec.
     *
     * @throws std::out_of_range if vec.size() != nrow * ncol.
     */
    Matrix(size_t nrow, size_t ncol, std::vector<double> const & vec)
    {
        // Check before allocating: a constructor that throws does not run the
        // destructor, so a buffer allocated earlier would leak.
        if (nrow * ncol != vec.size())
        {
            throw std::out_of_range("number of elements mismatch");
        }
        reset_buffer(nrow, ncol);
        (*this) = vec;
    }

    /**
     * Overwrite all elements from vec, interpreted in row-major order.
     *
     * @throws std::out_of_range if vec.size() != size().
     */
    Matrix & operator=(std::vector<double> const & vec)
    {
        if (size() != vec.size())
        {
            throw std::out_of_range("number of elements mismatch");
        }

        // The buffer is row-major, so the flat position equals the position
        // in vec.
        for (size_t k=0; k<vec.size(); ++k)
        {
            m_buffer[k] = vec[k];
        }

        return *this;
    }

    /// Deep copy of the elements and of the timing bookkeeping.
    Matrix(Matrix const & other)
      : m_elapsed(other.m_elapsed), m_nflo(other.m_nflo)
    {
        reset_buffer(other.m_nrow, other.m_ncol);
        copy_elements(other);
    }

    /// Deep copy; the buffer is reallocated only when the shape differs.
    Matrix & operator=(Matrix const & other)
    {
        if (this == &other) { return *this; }
        if (m_nrow != other.m_nrow || m_ncol != other.m_ncol)
        {
            reset_buffer(other.m_nrow, other.m_ncol);
        }
        copy_elements(other);
        m_elapsed = other.m_elapsed;
        m_nflo = other.m_nflo;
        return *this;
    }

    /// Steal the buffer of other, which is left as an empty (0 x 0) matrix.
    Matrix(Matrix && other) noexcept
      : m_nrow(other.m_nrow), m_ncol(other.m_ncol), m_buffer(other.m_buffer)
      , m_elapsed(other.m_elapsed), m_nflo(other.m_nflo)
    {
        other.m_nrow = 0;
        other.m_ncol = 0;
        other.m_buffer = nullptr;
    }

    /**
     * Release the current buffer and take over that of other, which is left
     * as an empty (0 x 0) matrix.  The bookkeeping fields are exchanged.
     */
    Matrix & operator=(Matrix && other) noexcept
    {
        if (this == &other) { return *this; }
        reset_buffer(0, 0); // only frees; no allocation happens for size 0
        std::swap(m_nrow, other.m_nrow);
        std::swap(m_ncol, other.m_ncol);
        std::swap(m_buffer, other.m_buffer);
        std::swap(m_elapsed, other.m_elapsed);
        std::swap(m_nflo, other.m_nflo);
        return *this;
    }

    ~Matrix()
    {
        reset_buffer(0, 0);
    }

    // Element access by (row, column); no bounds checking.
    double   operator() (size_t row, size_t col) const { return m_buffer[index(row, col)]; }
    double & operator() (size_t row, size_t col)       { return m_buffer[index(row, col)]; }

    // Element access by flat row-major position; no bounds checking.
    double   operator[] (size_t idx) const { return m_buffer[idx]; }
    double & operator[] (size_t idx)       { return m_buffer[idx]; }

    size_t nrow() const { return m_nrow; }
    size_t ncol() const { return m_ncol; }

    /// Total number of elements.
    size_t size() const { return m_nrow * m_ncol; }
    double buffer(size_t i) const { return m_buffer[i]; }
    /// Copy of all elements in row-major order.
    std::vector<double> buffer_vector() const { return std::vector<double>(m_buffer, m_buffer+size()); }

    // Seconds recorded for the operation that produced this matrix.
    double   elapsed() const { return m_elapsed; }
    double & elapsed()       { return m_elapsed; }

    // Operation count recorded for the operation that produced this matrix.
    size_t   nflo() const { return m_nflo; }
    size_t & nflo()       { return m_nflo; }

    /// Recorded operation count per recorded second, in units of 1e9.
    double gflops() const { return m_nflo / m_elapsed / 1.e9; }

    Matrix transpose() const;

public:

    /// Flat row-major position of element (row, col).
    size_t index(size_t row, size_t col) const
    {
        return row * m_ncol + col;
    }

    /**
     * Drop the current buffer and allocate a new uninitialized one for an
     * (nrow x ncol) matrix.  A zero element count leaves the buffer null.
     */
    void reset_buffer(size_t nrow, size_t ncol)
    {
        // Release first and put the object into a valid empty state, so that
        // a failed allocation below cannot leave a dangling pointer behind.
        delete[] m_buffer; // deleting a null pointer is a no-op
        m_buffer = nullptr;
        m_nrow = 0;
        m_ncol = 0;

        const size_t nelement = nrow * ncol;
        if (nelement) { m_buffer = new double[nelement]; }
        m_nrow = nrow;
        m_ncol = ncol;
    }

    /// Copy every element from other, which must have the same shape.
    void copy_elements(Matrix const & other)
    {
        const size_t nelement = size();
        for (size_t k=0; k<nelement; ++k)
        {
            m_buffer[k] = other.m_buffer[k];
        }
    }

    size_t m_nrow = 0;
    size_t m_ncol = 0;
    double * m_buffer = nullptr;
    double m_elapsed = 0;
    size_t m_nflo = 0; // number of floating-point operations.

};

/*
 * Return a new matrix holding the transpose of this one.  The timing
 * bookkeeping of the result is left at its defaults.
 */
Matrix Matrix::transpose() const
{
    // The transpose of an (nrow x ncol) matrix has shape (ncol x nrow);
    // allocating it with the source shape would index out of bounds for any
    // non-square matrix.
    Matrix ret(ncol(), nrow());

    for (size_t i=0; i<nrow(); ++i)
    {
        for (size_t j=0; j<ncol(); ++j)
        {
            ret(j, i) = (*this)(i, j);
        }
    }

    return ret;
}

/*
 * Two matrices are equal when they have the same shape and every pair of
 * corresponding elements compares equal with operator== on double.
 */
bool operator== (Matrix const & mat1, Matrix const & mat2)
{
    // Any shape difference makes the matrices unequal.  This must be checked
    // before the element loop, which indexes mat2 with the extents of mat1.
    if ((mat1.ncol() != mat2.ncol()) || (mat1.nrow() != mat2.nrow()))
    {
        return false;
    }

    for (size_t i=0; i<mat1.nrow(); ++i)
    {
        for (size_t j=0; j<mat1.ncol(); ++j)
        {
            if (mat1(i, j) != mat2(i, j))
            {
                return false;
            }
        }
    }

    return true;
}

/*
 * Negation of operator== for matrices.
 */
bool operator!= (Matrix const & mat1, Matrix const & mat2)
{
    const bool same = (mat1 == mat2);
    return !same;
}

/*
 * Check that mat1 * mat2 is well-formed, i.e. that mat1 has as many columns
 * as mat2 has rows.  Throws std::out_of_range otherwise.
 */
void validate_multiplication(Matrix const & mat1, Matrix const & mat2)
{
    const bool compatible = (mat1.ncol() == mat2.nrow());
    if (compatible) { return; }

    throw std::out_of_range(
        "the number of first matrix column "
        "differs from that of second matrix row");
}

/*
 * Operation count attributed to multiplying mat1 by mat2: the product of the
 * row count of mat1, the column count of mat1, and the column count of mat2.
 */
size_t calc_nflo(Matrix const & mat1, Matrix const & mat2)
{
    const size_t nrow1 = mat1.nrow();
    const size_t ncol1 = mat1.ncol();
    const size_t ncol2 = mat2.ncol();
    return nrow1 * ncol1 * ncol2;
}

/*
 * Use MKL for the matrix matrix multiplication.
 *
 * Calls cblas_dgemm on the raw row-major buffers to compute mat1 * mat2 and
 * records the elapsed time and operation count in the returned matrix.
 * Throws std::out_of_range (via validate_multiplication) when the shapes do
 * not support multiplication.
 */
Matrix multiply_mkl(Matrix const & mat1, Matrix const & mat2)
{
    // Same shape check as every other kernel in this file; without it a
    // mismatched pair would hand inconsistent extents to cblas_dgemm.
    validate_multiplication(mat1, mat2);

#if !defined(HASMKL) || defined(NOMKL)
    // run with VECLIB_MAXIMUM_THREADS=1
#else // HASMKL NOMKL
    mkl_set_num_threads(1);
#endif // HASMKL NOMKL

    Matrix ret(mat1.nrow(), mat2.ncol());

    StopWatch sw;

    cblas_dgemm(
        CblasRowMajor /* const CBLAS_LAYOUT Layout */
      , CblasNoTrans /* const CBLAS_TRANSPOSE transa */
      , CblasNoTrans /* const CBLAS_TRANSPOSE transb */
      , mat1.nrow() /* const MKL_INT m */
      , mat2.ncol() /* const MKL_INT n */
      , mat1.ncol() /* const MKL_INT k */
      , 1.0 /* const double alpha */
      , mat1.m_buffer /* const double *a */
      , mat1.ncol() /* const MKL_INT lda */
      , mat2.m_buffer /* const double *b */
      , mat2.ncol() /* const MKL_INT ldb */
      , 0.0 /* const double beta */
      , ret.m_buffer /* double * c */
      , ret.ncol() /* const MKL_INT ldc */
    );

    ret.elapsed() = sw.lap();
    ret.nflo() = calc_nflo(mat1, mat2);

    return ret;
}

/*
 * Naive matrix-matrix multiplication that reaches every element through
 * Matrix::operator().  The elapsed time of the triple loop and the operation
 * count are stored in the returned matrix.
 */
Matrix multiply_indirect(Matrix const & mat1, Matrix const & mat2)
{
    validate_multiplication(mat1, mat2);

    Matrix product(mat1.nrow(), mat2.ncol());

    StopWatch timer;

    for (size_t row=0; row<mat1.nrow(); ++row)
    {
        for (size_t col=0; col<mat2.ncol(); ++col)
        {
            // Dot product of a row of mat1 with a column of mat2.
            double sum = 0;
            for (size_t inner=0; inner<mat1.ncol(); ++inner)
            {
                sum += mat1(row,inner) * mat2(inner,col);
            }
            product(row,col) = sum;
        }
    }

    product.elapsed() = timer.lap();
    product.nflo() = calc_nflo(mat1, mat2);

    return product;
}

/*
 * Naive matrix-matrix multiplication through Matrix::operator(), with the
 * loops nested as (i,j,k) instead of (i,k,j).  The innermost loop therefore
 * walks along a row of mat2 and a row of the result, both contiguous in the
 * row-major buffers; this ordering is intended to run faster than the
 * (i,k,j) one because of the better locality on mat2.
 */
Matrix multiply_indirect_order1(Matrix const & mat1, Matrix const & mat2)
{
    validate_multiplication(mat1, mat2);

    Matrix product(mat1.nrow(), mat2.ncol());

    StopWatch timer;

    // The result is accumulated in place, so it has to start from zero.
    // Zeroing is part of the timed region.
    for (size_t row=0; row<product.nrow(); ++row)
    {
        for (size_t col=0; col<product.ncol(); ++col)
        {
            product(row,col) = 0;
        }
    }

    for (size_t row=0; row<mat1.nrow(); ++row)
    {
        for (size_t inner=0; inner<mat1.ncol(); ++inner)
        {
            for (size_t col=0; col<mat2.ncol(); ++col)
            {
                product(row,col) += mat1(row,inner) * mat2(inner,col);
            }
        }
    }

    product.elapsed() = timer.lap();
    product.nflo() = calc_nflo(mat1, mat2);

    return product;
}

/*
 * Direct naive matrix matrix multiplication.
 *
 * Same (i,k,j) algorithm as multiply_indirect, but the raw buffers are
 * indexed by hand instead of going through Matrix::operator().  The elapsed
 * time and operation count are stored in the returned matrix.
 */
Matrix multiply_direct(Matrix const & mat1, Matrix const & mat2)
{
    validate_multiplication(mat1, mat2);

    Matrix ret(mat1.nrow(), mat2.ncol());

    const size_t nrow1 = mat1.nrow();
    const size_t ncol1 = mat1.ncol();
    const size_t ncol2 = mat2.ncol();

    StopWatch sw;

    for (size_t i=0; i<nrow1; ++i)
    {
        const size_t base1 = i * ncol1;
        // The result has ncol2 columns, so its row offset differs from that
        // of mat1 whenever the matrices are not square.
        const size_t base_ret = i * ncol2;
        for (size_t k=0; k<ncol2; ++k)
        {
            double v = 0;
            for (size_t j=0; j<ncol1; ++j)
            {
                v += mat1.m_buffer[base1 + j] * mat2.m_buffer[j*ncol2 + k];
            }
            ret.m_buffer[base_ret + k] = v;
        }
    }

    ret.elapsed() = sw.lap();
    ret.nflo() = calc_nflo(mat1, mat2);

    return ret;
}

/*
 * Fixed-size N x N tile of doubles stored inline in a flat array.
 */
template<size_t N>
struct Block
{
    static constexpr const size_t NDIM = N;

    // Flat element access; no bounds checking.
    double   operator[] (size_t idx) const { return m_buffer[idx]; }
    double & operator[] (size_t idx)       { return m_buffer[idx]; }

    /// Set every element of the tile to v.
    Block<N> & operator= (double v)
    {
        for (double & elem : m_buffer) { elem = v; }
        return *this;
    }

    /// Element-wise accumulation of another tile into this one.
    Block<N> & operator+= (Block<N> const & other)
    {
        for (size_t pos=0; pos<N*N; ++pos)
        {
            m_buffer[pos] += other.m_buffer[pos];
        }
        return *this;
    }

    /// Write this tile into mat at tile coordinates (it, jt).
    void save(Matrix & mat, size_t it, size_t jt);

    double m_buffer[N * N];
};

/*
 * Copy this tile into mat so that its top-left element lands at row it*NDIM
 * and column jt*NDIM of mat.  No bounds checking is performed.
 */
template<size_t N> void Block<N>::save(
    Matrix & mat, size_t it, size_t jt
)
{
    const size_t mat_ncol = mat.ncol();
    const size_t col_offset = jt*NDIM;

    for (size_t row=0; row<NDIM; ++row)
    {
        // Start of the current row inside the tile and inside the matrix.
        const size_t tile_row = row*NDIM;
        const size_t mat_row = (it*NDIM + row) * mat_ncol + col_offset;

        for (size_t col=0; col<NDIM; ++col)
        {
            mat.m_buffer[mat_row + col] = m_buffer[tile_row + col];
        }
    }
}

/*
 * Working set for multiplying one pair of N x N tiles: two input tiles and
 * one output tile.
 */
template<size_t N>
struct Tiler
{
    static constexpr const size_t NDIM = N;

    /// Fill the two input tiles from the given tile coordinates of mat1 and mat2.
    void load(
        Matrix const & mat1, size_t it1, size_t jt1
      , Matrix const & mat2, size_t it2, size_t jt2
    );

    /// Multiply the two loaded tiles and store the product in m_ret.
    void multiply();

    Block<N> m_mat1; // left operand tile, row-major
    Block<N> m_mat2; // right operand tile, column-major
    Block<N> m_ret; // product tile, row-major
};

/*
 * Copy tile (it1, jt1) of mat1 into m_mat1 and tile (it2, jt2) of mat2 into
 * m_mat2.  The mat2 tile is first copied as-is into m_ret, which serves as
 * scratch space here, and is then transposed into m_mat2 so that m_mat2
 * holds it in column-major order.
 */
template<size_t N> void Tiler<N>::load(
    Matrix const & mat1, size_t it1, size_t jt1
  , Matrix const & mat2, size_t it2, size_t jt2
)
{
    // Left operand: straight row-by-row copy.
    const size_t stride1 = mat1.ncol();

    for (size_t row=0; row<NDIM; ++row)
    {
        const size_t dst = row*NDIM;
        const size_t src = (it1*NDIM + row) * stride1 + jt1*NDIM;

        for (size_t col=0; col<NDIM; ++col)
        {
            m_mat1[dst + col] = mat1.m_buffer[src + col];
        }
    }

    // Right operand: row-by-row copy into the scratch tile.
    const size_t stride2 = mat2.ncol();

    for (size_t row=0; row<NDIM; ++row)
    {
        const size_t dst = row*NDIM;
        const size_t src = (it2*NDIM + row) * stride2 + jt2*NDIM;

        for (size_t col=0; col<NDIM; ++col)
        {
            m_ret[dst + col] = mat2.m_buffer[src + col];
        }
    }

    // Transpose the scratch tile into m_mat2.
    for (size_t row=0; row<NDIM; ++row)
    {
        const size_t src = row*NDIM;

        for (size_t col=0; col<NDIM; ++col)
        {
            m_mat2[col*NDIM + row] = m_ret[src + col];
        }
    }
}

/*
 * Compute m_ret = m_mat1 * m_mat2 for the loaded tiles.  Because m_mat2 is
 * stored column-major, the innermost loop reads both operands with unit
 * stride.
 */
template<size_t N> void Tiler<N>::multiply()
{
    for (size_t row=0; row<NDIM; ++row)
    {
        const size_t lhs_row = row*NDIM;

        for (size_t col=0; col<NDIM; ++col)
        {
            const size_t rhs_col = col*NDIM;

            double sum = 0;
            for (size_t inner=0; inner<NDIM; ++inner)
            {
                sum += m_mat1[lhs_row + inner] * m_mat2[rhs_col + inner];
            }
            m_ret[lhs_row + col] = sum;
        }
    }
}

/*
 * Tiled matrix matrix multiplication.
 *
 * LSIZE is the byte length of one tile row; each tile holds
 * (LSIZE / sizeof(double))^2 doubles.  The product is assembled tile by
 * tile, and the elapsed time and operation count are stored in the returned
 * matrix.
 *
 * Throws std::out_of_range when the shapes do not support multiplication or
 * when a dimension is not a multiple of the tile size.
 */
template<size_t LSIZE>
Matrix multiply_tile(Matrix const & mat1, Matrix const & mat2)
{
    static_assert(LSIZE >= sizeof(double), "LSIZE must hold at least one double");

    validate_multiplication(mat1, mat2);

    constexpr const size_t tsize = LSIZE / sizeof(double);

    const size_t nrow1 = mat1.nrow();
    const size_t ncol1 = mat1.ncol();
    const size_t ncol2 = mat2.ncol();

    // The loops below only visit whole tiles.  Leftover rows or columns would
    // be skipped silently and the matching result elements never written.
    if ((nrow1 % tsize) || (ncol1 % tsize) || (ncol2 % tsize))
    {
        throw std::out_of_range(
            "matrix dimensions are not multiples of the tile size");
    }

    Matrix ret(nrow1, ncol2);

    const size_t ntrow1 = nrow1 / tsize;
    const size_t ntcol1 = ncol1 / tsize;
    const size_t ntcol2 = ncol2 / tsize;

    Block<tsize> value;
    Tiler<tsize> tiler;

    StopWatch sw;

    for (size_t it=0; it<ntrow1; ++it)
    {
        for (size_t kt=0; kt<ntcol2; ++kt)
        {
            // Accumulate the (it, kt) result tile over the shared dimension.
            value = 0;
            for (size_t jt=0; jt<ntcol1; ++jt)
            {
                tiler.load(mat1, it, jt, mat2, jt, kt);
                tiler.multiply();
                value += tiler.m_ret;
            }
            value.save(ret, it, kt);
        }
    }

    ret.elapsed() = sw.lap();
    ret.nflo() = calc_nflo(mat1, mat2);

    return ret;
}

/*
 * Fill mat with a deterministic pattern: even-numbered rows count up from 0
 * along the columns, odd-numbered rows count down from ncol-1.
 */
void initialize(Matrix & mat)
{
    for (size_t row=0; row<mat.nrow(); ++row)
    {
        const bool ascending = (0 == row%2);

        for (size_t col=0; col<mat.ncol(); ++col)
        {
            if (ascending) { mat(row, col) = col; }
            else           { mat(row, col) = mat.ncol() - 1 - col; }
        }
    }
}

/*
 * Run one multiplication kernel on mat1 and mat2, print its timing, and
 * return the product.
 *
 * With the default TSIZE the kernel is chosen by tag ("mkl", "indirect",
 * "indirect_order1" or "direct"); any other tag throws std::runtime_error.
 * With any other TSIZE the tiled kernel multiply_tile<TSIZE> is used and tag
 * is replaced by "tiled <TSIZE>".
 *
 * When gold is non-null the product is compared against it and
 * std::runtime_error is thrown on mismatch.
 */
template<size_t TSIZE=sizeof(double)>
Matrix time_tile(
    std::string tag, Matrix const * gold
  , Matrix const & mat1, Matrix const & mat2
)
{
    std::function<Matrix (Matrix const &, Matrix const &)> kernel;

    if (sizeof(double) != TSIZE)
    {
        // Tiled run: the label is derived from the tile size.
        std::ostringstream label;
        label << "tiled " << TSIZE;
        tag = label.str();
        kernel = multiply_tile<TSIZE>;
    }
    else if ("mkl" == tag)             { kernel = multiply_mkl; }
    else if ("indirect" == tag)        { kernel = multiply_indirect; }
    else if ("indirect_order1" == tag) { kernel = multiply_indirect_order1; }
    else if ("direct" == tag)          { kernel = multiply_direct; }
    else
    {
        throw std::runtime_error("invalid tag");
    }

    std::cout << "Timing " << tag << ": ";

    Matrix res = kernel(mat1, mat2);

    const bool mismatch = (nullptr != gold) && (res != *gold);
    if (mismatch)
    {
        throw std::runtime_error("answer mismatch");
    }

    std::cout << res.elapsed() << " second, ";
    std::cout << res.nflo()/1.e9 << " Gflo, ";
    std::cout << res.gflops() << " Gflops" << std::endl;

    return res;
}

int main(int argc, char ** argv)
{
    Matrix mat1(1 * 1024, 1 * 1024);
    initialize(mat1);
    Matrix mat2 = mat1;

    Matrix mat_gold = time_tile("mkl", nullptr, mat1, mat2);

    time_tile("indirect", &mat_gold, mat1, mat2);
    time_tile("indirect_order1", &mat_gold, mat1, mat2);
    time_tile("direct", &mat_gold, mat1, mat2);

    time_tile<32>("", &mat_gold, mat1, mat2);
    time_tile<64>("", &mat_gold, mat1, mat2);
    time_tile<128>("", &mat_gold, mat1, mat2);
    time_tile<256>("", &mat_gold, mat1, mat2);
    time_tile<512>("", &mat_gold, mat1, mat2);
    time_tile<1024>("", &mat_gold, mat1, mat2);

    return 0;
}