pure function mat_mat_block_rel(a, b, transA, transB, option, nblock) result(c)
real(rk), intent(in), contiguous :: a(:,:), b(:,:)
logical, intent(in), optional :: transA, transB
character(*), intent(in), optional :: option
integer, intent(in) :: nblock
real(rk), allocatable :: c(:,:)
integer :: ib, se, ee
integer :: block_size(nblock), start_elem(nblock), end_elem(nblock)
if (present(transA) .and. present(transB)) then
if (.not.transA .and. .not.transB) then
! AB
allocate(C(size(A,1), size(B,2)), source=0.0_rk)
call compute_block_ranges(size(B,2), nblock, block_size, start_elem, end_elem)
#if defined(USE_DO_CONCURRENT)
do concurrent (ib = 1: nblock)
se = start_elem(ib)
ee = end_elem(ib)
C(:, se:ee) = &
C(:, se:ee) + matmul(A, B(:,se:ee), transA=.false., transB=.false., option=option)
end do
#else
do ib = 1, nblock
se = start_elem(ib)
ee = end_elem(ib)
C(:, se:ee) = &
C(:, se:ee) + matmul(A, B(:,se:ee), transA=.false., transB=.false., option=option)
end do
#endif
else if (transA .and. transB) then
! ATBT
allocate(C(size(A,2), size(B,1)), source=0.0_rk)
call compute_block_ranges(size(A,2), nblock, block_size, start_elem, end_elem)
#if defined(USE_DO_CONCURRENT)
do concurrent (ib = 1: nblock)
se = start_elem(ib)
ee = end_elem(ib)
C(se:ee, :) = &
C(se:ee, :) + matmul(A(:, se:ee), B, transA=.true., transB=.true., option=option)
end do
#else
do ib = 1, nblock
se = start_elem(ib)
ee = end_elem(ib)
C(se:ee, :) = &
C(se:ee, :) + matmul(A(:, se:ee), B, transA=.true., transB=.true., option=option)
end do
#endif
else if (transA .and. .not.transB) then
! ATB
allocate(C(size(A,2), size(B,2)), source=0.0_rk)
call compute_block_ranges(size(A,2), nblock, block_size, start_elem, end_elem)
#if defined(USE_DO_CONCURRENT)
do concurrent (ib = 1: nblock)
se = start_elem(ib)
ee = end_elem(ib)
C(se:ee, :) = &
C(se:ee, :) + matmul(A(:, se:ee), B, transA=.true., transB=.false., option=option)
end do
#else
do ib = 1, nblock
se = start_elem(ib)
ee = end_elem(ib)
C(se:ee, :) = &
C(se:ee, :) + matmul(A(:, se:ee), B, transA=.true., transB=.false., option=option)
end do
#endif
else if (.not.transA .and. transB) then
! ABT
allocate(C(size(A,1), size(B,1)), source=0.0_rk)
call compute_block_ranges(size(A,2), nblock, block_size, start_elem, end_elem)
#if defined(USE_DO_CONCURRENT)
do concurrent (ib = 1: nblock)
se = start_elem(ib)
ee = end_elem(ib)
C(:, :) = C(:, :) + &
matmul(A(:, se:ee), B(:,se:ee), transA=.false., transB=.true., option=option)
end do
#else
do ib = 1, nblock
se = start_elem(ib)
ee = end_elem(ib)
C(:, :) = C(:, :) + &
matmul(A(:, se:ee), B(:,se:ee), transA=.false., transB=.true., option=option)
end do
#endif
end if
else if (present(transA) .or. present(transB)) then
if (present(transA)) then
if (transA) then
! ATB
allocate(C(size(A,2), size(B,2)), source=0.0_rk)
call compute_block_ranges(size(A,2), nblock, block_size, start_elem, end_elem)
#if defined(USE_DO_CONCURRENT)
do concurrent (ib = 1: nblock)
se = start_elem(ib)
ee = end_elem(ib)
C(se:ee, :) = &
C(se:ee, :) + matmul(A(:, se:ee), B, transA=.true., transB=.false., option=option)
end do
#else
do ib = 1, nblock
se = start_elem(ib)
ee = end_elem(ib)
C(se:ee, :) = &
C(se:ee, :) + matmul(A(:, se:ee), B, transA=.true., transB=.false., option=option)
end do
#endif
else if (.not.transA) then
! ABT
allocate(C(size(A,1), size(B,1)), source=0.0_rk)
call compute_block_ranges(size(A,2), nblock, block_size, start_elem, end_elem)
#if defined(USE_DO_CONCURRENT)
do concurrent (ib = 1: nblock)
se = start_elem(ib)
ee = end_elem(ib)
C(:, :) = C(:, :) + &
matmul(A(:, se:ee), B(:,se:ee), transA=.false., transB=.true., option=option)
end do
#else
do ib = 1, nblock
se = start_elem(ib)
ee = end_elem(ib)
C(:, :) = C(:, :) + &
matmul(A(:, se:ee), B(:,se:ee), transA=.false., transB=.true., option=option)
end do
#endif
end if
else if (present(transB)) then
if (transB) then
! ABT
allocate(C(size(A,1), size(B,1)), source=0.0_rk)
call compute_block_ranges(size(A,2), nblock, block_size, start_elem, end_elem)
#if defined(USE_DO_CONCURRENT)
do concurrent (ib = 1: nblock)
se = start_elem(ib)
ee = end_elem(ib)
C(:, :) = C(:, :) + &
matmul(A(:, se:ee), B(:,se:ee), transA=.false., transB=.true., option=option)
end do
#else
do ib = 1, nblock
se = start_elem(ib)
ee = end_elem(ib)
C(:, :) = C(:, :) + &
matmul(A(:, se:ee), B(:,se:ee), transA=.false., transB=.true., option=option)
end do
#endif
else if (.not.transB) then
! ATB
allocate(C(size(A,2), size(B,2)), source=0.0_rk)
call compute_block_ranges(size(A,2), nblock, block_size, start_elem, end_elem)
#if defined(USE_DO_CONCURRENT)
do concurrent (ib = 1: nblock)
se = start_elem(ib)
ee = end_elem(ib)
C(se:ee, :) = &
C(se:ee, :) + matmul(A(:, se:ee), B, transA=.true., transB=.false., option=option)
end do
#else
do ib = 1, nblock
se = start_elem(ib)
ee = end_elem(ib)
C(se:ee, :) = &
C(se:ee, :) + matmul(A(:, se:ee), B, transA=.true., transB=.false., option=option)
end do
#endif
end if
end if
else if (.not.present(transA) .and. .not.present(transB)) then
! AB
allocate(C(size(A,1), size(B,2)), source=0.0_rk)
call compute_block_ranges(size(B,2), nblock, block_size, start_elem, end_elem)
#if defined(USE_DO_CONCURRENT)
do concurrent (ib = 1: nblock)
se = start_elem(ib)
ee = end_elem(ib)
C(:, se:ee) = &
C(:, se:ee) + matmul(A, B(:,se:ee), transA=.false., transB=.false., option=option)
end do
#else
do ib = 1, nblock
se = start_elem(ib)
ee = end_elem(ib)
C(:, se:ee) = &
C(:, se:ee) + matmul(A, B(:,se:ee), transA=.false., transB=.false., option=option)
end do
#endif
end if
end function mat_mat_block_rel