impure function mat_mat_coarray_rel(a, b, transA, transB, option, coarray) result(c)
real(rk), intent(in), contiguous :: a(:,:), b(:,:)
character(*), intent(in), optional :: option
logical, intent(in), optional :: transA, transB
real(rk), allocatable :: c(:,:)
logical, intent(in) :: coarray
#if defined (USE_COARRAY)
integer :: i, im, nimg, n, m
integer :: block_size(num_images()), start_elem(num_images()), end_elem(num_images())
real(rk), allocatable :: C_block(:,:)[:], B_block(:,:)[:], A_block(:,:)[:]
if (present(transA) .and. present(transB)) then
if (.not.transA .and. .not.transB) then
! AB
allocate(C(size(A,1), size(B,2)), source=0.0_rk)
im = this_image()
nimg = num_images()
m = size(A,1)
n = size(A,2)
call compute_block_ranges(size(B,2), nimg, block_size, start_elem, end_elem)
allocate(B_block(n, block_size(im))[*], C_block(m, block_size(im))[*])
B_block(:,:)[im] = B(:, start_elem(im):end_elem(im))
C_block(:,:)[im] = matmul(A, B_block(:,:)[im], transA=.false., transB=.false., option=option)
sync all
if (im == 1) then
do i = 1, nimg
C(:,start_elem(i):end_elem(i)) = C_block(:,:)[i]
end do
end if
else if (transA .and. transB) then
! ATBT
allocate(C(size(A,2), size(B,1)), source=0.0_rk)
im = this_image()
nimg = num_images()
m = size(A,1)
n = size(A,2)
call compute_block_ranges(size(A,2), nimg, block_size, start_elem, end_elem)
allocate(A_block(m, block_size(im))[*], C_block(block_size(im), size(B,1))[*])
A_block(:,:)[im] = A(:, start_elem(im):end_elem(im))
C_block(:,:)[im] = matmul(A_block(:,:)[im], B, transA=.true., transB=.true., option=option)
sync all
if (im == 1) then
do i = 1, nimg
C(start_elem(i):end_elem(i), :) = C_block(:,:)[i]
end do
end if
else if (transA .and. .not.transB) then
! ATB
allocate(C(size(A,2), size(B,2)), source=0.0_rk)
im = this_image()
nimg = num_images()
m = size(A,1)
n = size(A,2)
call compute_block_ranges(size(A,2), nimg, block_size, start_elem, end_elem)
allocate(A_block(m, block_size(im))[*], C_block(block_size(im), size(B,2))[*])
A_block(:,:)[im] = A(:, start_elem(im):end_elem(im))
C_block(:,:)[im] = matmul(A_block(:, :)[im], B, transA=.true., transB=.false., option=option)
sync all
if (im == 1) then
do i = 1, nimg
C(start_elem(i):end_elem(i), :) = C_block(:,:)[i]
end do
end if
else if (.not.transA .and. transB) then
! ABT
allocate(C(size(A,1), size(B,1)), source=0.0_rk)
im = this_image()
nimg = num_images()
m = size(A,1)
n = size(A,2)
call compute_block_ranges(size(A,2), nimg, block_size, start_elem, end_elem)
allocate(A_block(m, block_size(im))[*], B_block(size(B,1), block_size(im))[*])
allocate(C_block(m, size(B,1))[*])
A_block(:,:)[im] = A(:, start_elem(im):end_elem(im))
B_block(:,:)[im] = B(:, start_elem(im):end_elem(im))
C_block(:,:)[im] = matmul(A_block(:,:)[im], B_block(:,:)[im], transA=.false., transB=.true., option=option)
sync all
if (im == 1) then
do i = 1, nimg
C(:, :) = C(:, :) + C_block(:,:)[i]
end do
end if
end if
else if (present(transA) .or. present(transB)) then
if (present(transA)) then
if (transA) then
! ATB
allocate(C(size(A,2), size(B,2)), source=0.0_rk)
im = this_image()
nimg = num_images()
m = size(A,1)
n = size(A,2)
call compute_block_ranges(size(A,2), nimg, block_size, start_elem, end_elem)
allocate(A_block(m, block_size(im))[*], C_block(block_size(im), size(B,2))[*])
A_block(:,:)[im] = A(:, start_elem(im):end_elem(im))
C_block(:,:)[im] = matmul(A_block(:, :)[im], B, transA=.true., transB=.false., option=option)
sync all
if (im == 1) then
do i = 1, nimg
C(start_elem(i):end_elem(i), :) = C_block(:,:)[i]
end do
end if
else if (.not.transA) then
! ABT
allocate(C(size(A,1), size(B,1)), source=0.0_rk)
im = this_image()
nimg = num_images()
m = size(A,1)
n = size(A,2)
call compute_block_ranges(size(A,2), nimg, block_size, start_elem, end_elem)
allocate(A_block(m, block_size(im))[*], B_block(size(B,1), block_size(im))[*])
allocate(C_block(m, size(B,1))[*])
A_block(:,:)[im] = A(:, start_elem(im):end_elem(im))
B_block(:,:)[im] = B(:, start_elem(im):end_elem(im))
C_block(:,:)[im] = matmul(A_block(:,:)[im], B_block(:,:)[im], transA=.false., transB=.true., option=option)
sync all
if (im == 1) then
do i = 1, nimg
C(:, :) = C(:, :) + C_block(:,:)[i]
end do
end if
end if
else if (present(transB)) then
if (transB) then
! ABT
allocate(C(size(A,1), size(B,1)), source=0.0_rk)
im = this_image()
nimg = num_images()
m = size(A,1)
n = size(A,2)
call compute_block_ranges(size(A,2), nimg, block_size, start_elem, end_elem)
allocate(A_block(m, block_size(im))[*], B_block(size(B,1), block_size(im))[*])
allocate(C_block(m, size(B,1))[*])
A_block(:,:)[im] = A(:, start_elem(im):end_elem(im))
B_block(:,:)[im] = B(:, start_elem(im):end_elem(im))
C_block(:,:)[im] = matmul(A_block(:,:)[im], B_block(:,:)[im], transA=.false., transB=.true., option=option)
sync all
if (im == 1) then
do i = 1, nimg
C(:, :) = C(:, :) + C_block(:,:)[i]
end do
end if
else if (.not.transB) then
! ATB
allocate(C(size(A,2), size(B,2)), source=0.0_rk)
im = this_image()
nimg = num_images()
m = size(A,1)
n = size(A,2)
call compute_block_ranges(size(A,2), nimg, block_size, start_elem, end_elem)
allocate(A_block(m, block_size(im))[*], C_block(block_size(im), size(B,2))[*])
A_block(:,:)[im] = A(:, start_elem(im):end_elem(im))
C_block(:,:)[im] = matmul(A_block(:, :)[im], B, transA=.true., transB=.false., option=option)
sync all
if (im == 1) then
do i = 1, nimg
C(start_elem(i):end_elem(i), :) = C_block(:,:)[i]
end do
end if
end if
end if
else if (.not.present(transA) .and. .not.present(transB)) then
! AB
allocate(C(size(A,1), size(B,2)), source=0.0_rk)
im = this_image()
nimg = num_images()
m = size(A,1)
n = size(A,2)
call compute_block_ranges(size(B,2), nimg, block_size, start_elem, end_elem)
allocate(B_block(n, block_size(im))[*], C_block(m, block_size(im))[*])
B_block(:,:)[im] = B(:, start_elem(im):end_elem(im))
C_block(:,:)[im] = matmul(A, B_block(:,:)[im], transA=.false., transB=.false., option=option)
sync all
if (im == 1) then
do i = 1, nimg
C(:,start_elem(i):end_elem(i)) = C_block(:,:)[i]
end do
end if
end if
#else
C = matmul(A, B, transA=transA, transB=transB, option=option)
#endif
end function mat_mat_coarray_rel