!-------------------------------------------------------------------------------
! Copyright (c) 2016 The University of Tokyo
! This software is released under the MIT License, see LICENSE.txt
!-------------------------------------------------------------------------------
module m_hecmw_matrix_get_fillin
  use hecmw_util
  use hecmw_matrix_reorder
  use hecmw_matrix_misc
  implicit none

  private
  public :: hecmw_matrix_reorder_halo
  public :: hecmw_matrix_get_Shur_comp
  public :: hecmw_matrix_get_fillin
  public :: hecmw_matrix_get_fillin_bin
  public :: hecmw_matrix_get_fillin_bin_asym
  public :: hecmw_matrix_reorder_matrix
  public :: hecmw_matrix_copy_with_fillin
  public :: hecmw_matrix_copy_with_fillin_asym_33
  public :: hecmw_matrix_copy_with_fillin_asym_44

  type hecmw_fill
    integer(kind=kint) :: N = 0
    integer(kind=kint), pointer :: node(:) => null()
  endtype hecmw_fill

  type hecmw_fill_bin
    integer(kind=kint) :: n_descendant
    integer(kind=kint) :: n_ancestor
    integer(kind=kint), pointer :: descendant(:)
    integer(kind=kint), pointer :: ancestor(:)
    integer(kind=kint), pointer :: update_index(:)
    real(kind=kreal), pointer :: update(:,:,:)
    logical :: factorized
    logical :: updated
  endtype hecmw_fill_bin
contains

  subroutine hecmw_matrix_reorder_halo(hecMESH, N, NP, perm, iperm)
    use hecmw_util
    implicit none
    type(hecmwST_local_mesh) :: hecMESH
    integer(kind=kint) :: perm(:), iperm(:)
    integer(kind=kint) :: i, in, N, NP
    integer(kind=kint), allocatable :: temp(:)

    allocate(temp(NP))
    temp = 0

    in = 1
    do i=1,NP
      if(perm(i) <= N)then
        temp(in) = perm(i)
        in = in + 1
      endif
    enddo
    do i=N+1,NP
      temp(i) = i
    enddo

    do i=1,NP
      perm(i) = temp(i)
    enddo

    do i=1,NP
      in = perm(i)
      iperm(in) = i
    enddo

    deallocate(temp)
  end subroutine hecmw_matrix_reorder_halo

  subroutine hecmw_matrix_get_Shur_comp()
    use hecmw_util
    implicit none

  end subroutine hecmw_matrix_get_Shur_comp

  subroutine hecmw_matrix_reorder_matrix(hecMESH, hecMAT, hecT, perm, iperm)
    use hecmw_util
    implicit none
    type(hecmwST_local_mesh) :: hecMESH
    type(hecmwST_matrix) :: hecMAT
    type(hecmwST_matrix) :: hecT
    integer(kind=kint) :: perm(:), iperm(:)
    integer(kind=kint) :: N, NP, ANPL, ANPU, NDOF, NDOF2

    N    = hecMAT%N
    NP   = hecMAT%NP
    ANPL = hecMAT%indexL(NP)
    ANPU = hecMAT%indexU(NP)
    NDOF = hecMAT%NDOF
    NDOF2= NDOF*NDOF

    call hecmw_nullify_matrix( hecT )
    hecT%N = N
    hecT%NP= NP
    allocate(hecT%indexL(0:NP))
    allocate(hecT%indexU(0:NP))
    allocate(hecT%itemL(ANPL))
    allocate(hecT%itemU(ANPU))
    call hecmw_matrix_reorder_profile(NP, perm, iperm, &
         hecMAT%indexL, hecMAT%indexU, hecMAT%itemL, hecMAT%itemU, &
         hecT%indexL, hecT%indexU, hecT%itemL, hecT%itemU)

    allocate(hecT%D(NDOF2*NP))
    allocate(hecT%AL(NDOF2*ANPL))
    allocate(hecT%AU(NDOF2*ANPU))
    call hecmw_matrix_reorder_values(NP, NDOF, perm, iperm, &
         hecMAT%indexL, hecMAT%indexU, hecMAT%itemL, hecMAT%itemU, &
         hecMAT%AL, hecMAT%AU, hecMAT%D, &
         hecT%indexL, hecT%indexU, hecT%itemL, hecT%itemU, hecT%AL, hecT%AU, hecT%D)
  end subroutine hecmw_matrix_reorder_matrix

  subroutine hecmw_matrix_get_fillin_bin(hecMESH, N, indexUp, itemUp, idxU, itemU, NPU)
    use hecmw_util
    implicit none
    type(hecmwST_local_mesh) :: hecMESH
    type(hecmwST_matrix) :: hecMAT
    type(hecmw_fill_bin), pointer:: T(:)
    integer(kind=kint), pointer :: indexUp(:)
    integer(kind=kint), pointer :: idxU(:)
    integer(kind=kint), pointer :: itemUp(:)
    integer(kind=kint), pointer :: itemU(:)
    integer(kind=kint) :: N, NPU
    integer(kind=kint) :: i, j, k, jS, jE, in, c
    integer(kind=kint) :: Nbytes
    integer(kind=kint) :: Start,End
    integer(kind=kint) :: range,parent
    integer(kind=kint) :: ZERO = 0
    integer(kind=kint), pointer :: Array(:)
    integer(kind=kint), pointer :: FillinMask(:)
    integer(kind=kint), pointer :: ChildMask(:)
    integer(kind=kint), pointer :: ParentMask(:)
    integer(kind=kint) :: bit = kint*8
    integer(kind=kint), allocatable :: count(:), diff(:)

    allocate(T(N))

    do i = 1,N
      T(i)%factorized = .false.
      T(i)%updated = .false.
      T(i)%n_descendant = 0
      T(i)%n_ancestor = 0
      jS = indexUp(i-1) + 1
      jE = indexUp(i  )
      T(i)%n_ancestor=jE - jS + 1
      allocate(T(i)%ancestor(T(i)%n_ancestor))
      in = 0
      do j = jS, jE
        in = in + 1
        T(i)%ancestor(in) = itemUp(j)
      enddo
    enddo

    Nbytes = N/bit+1
    allocate( ChildMask ( Nbytes ) )
    allocate( ParentMask( Nbytes ) )
    allocate( FillinMask( Nbytes ) )

    do i = 1,N
      if (T(i)%n_ancestor<2) cycle
      Start = i/bit + 1
      ChildMask(Start:Nbytes) = 0
      ParentMask(Start:Nbytes) = 0

      parent = T(i)%ancestor(1)
      range = 0
      do j = 2, T(i)%n_ancestor
        in = T(i)%ancestor(j)
        End = in/bit + 1
        ChildMask(End) = ibset(ChildMask(End),mod(in,bit))
        range = in
      enddo
      k = T(parent)%n_ancestor
      do j = 1, k
        in = T(parent)%ancestor(j)
        End = in/bit + 1
        ParentMask(End) = ibset(ParentMask(End),mod(in,bit))
        range = max(range,in)
      enddo
      End = range/bit + 1

      FillinMask(Start:End) = ior(  ChildMask(Start:End), ParentMask(Start:End)  )

      c = 0
      do j = Start, End
        c = c + popcnt(FillinMask(j))
      enddo

      if (c > 0) then
        allocate( Array(c) )
        T( parent )%n_ancestor=c
        in=0
        do j = Start, End
          do k = 1, popcnt(FillinMask(j))
            in = in + 1
            c = popcnt( iand(FillinMask(j), - FillinMask(j)) -1 )
            FillinMask(j) = ibclr(FillinMask(j),c)
            Array(in) = bit*(j-1)+c
          enddo
        enddo
        deallocate( T(parent)%ancestor)
        T(parent)%ancestor => Array
      end if
      !j=N/20
      !if (j==0) j=1
      !if ( mod(i,j)==0 ) write(*,"(i0,a,i0,i5,a)") i,"/",N, 100*i/N,"% OK "

    enddo
    deallocate( ChildMask )
    deallocate( ParentMask )
    deallocate( FillinMask )

    !Descendantを作る．
    do i = 1,N
      do j = 1, T(i)%n_ancestor
        T( T(i)%ancestor(j) )%n_descendant = T( T(i)%ancestor(j) )%n_descendant + 1
      enddo
    enddo
    do i = 1,N
      allocate(T(i)%descendant(T(i)%n_descendant))
    enddo

    allocate( diff(N) )
    diff(:)=0
    do i = 1,N
      do j = 1, T(i)%n_ancestor
        diff( T(i)%ancestor(j) ) = diff( T(i)%ancestor(j) ) + 1
        T( T(i)%ancestor(j) )%descendant(diff( T(i)%ancestor(j) )) = i
      enddo
    enddo

    !count fill-in including diagonal entries
    allocate(idxU(0:N))
    in = 0
    idxU(0) = 0
    do i=1,N
      idxU(i) = idxU(i-1) + T(i)%n_ancestor + 1
      in = in + T(i)%n_ancestor + 1
    enddo
    NPU = in

    allocate(itemU(NPU))
    in = 0
    do i=1,N
      in = in + 1
      itemU(in) = i
      do j=1,T(i)%n_ancestor
        in = in + 1
        itemU(in) = T(i)%ancestor(j)
      enddo
    enddo
  end subroutine hecmw_matrix_get_fillin_bin

  subroutine hecmw_matrix_get_fillin_bin_asym(hecMESH, hecMAT, idxU, itemU, NPU, idxL, itemL, NPL)
    use hecmw_util
    implicit none
    type(hecmwST_local_mesh) :: hecMESH
    type(hecmwST_matrix) :: hecMAT
    type(hecmw_fill_bin), pointer:: T(:)
    integer(kind=kint), pointer :: idxU(:)
    integer(kind=kint), pointer :: itemU(:)
    integer(kind=kint), pointer :: idxL(:)
    integer(kind=kint), pointer :: itemL(:)
    integer(kind=kint) :: N, NPU, NPL
    integer(kind=kint) :: i, j, k, jS, jE, in, c
    integer(kind=kint) :: Nbytes
    integer(kind=kint) :: Start,End
    integer(kind=kint) :: range,parent
    integer(kind=kint) :: ZERO = 0
    integer(kind=kint), pointer :: Array(:)
    integer(kind=kint), pointer :: FillinMask(:)
    integer(kind=kint), pointer :: ChildMask(:)
    integer(kind=kint), pointer :: ParentMask(:)
    integer(kind=kint) :: bit = kint*8
    integer(kind=kint), allocatable :: count(:), diff(:)

    N = hecMAT%N
    allocate(T(N))

    do i = 1,N
      T(i)%factorized = .false.
      T(i)%updated = .false.
      T(i)%n_descendant = 0
      T(i)%n_ancestor = 0
      jS = hecMAT%indexU(i-1) + 1
      jE = hecMAT%indexU(i  )
      T(i)%n_ancestor=jE - jS + 1
      allocate(T(i)%ancestor(T(i)%n_ancestor))
      in = 0
      do j = jS, jE
        in = in + 1
        T(i)%ancestor(in) = hecMAT%itemU(j)
      enddo
    enddo

    Nbytes = N/bit+1
    allocate( ChildMask ( Nbytes ) )
    allocate( ParentMask( Nbytes ) )
    allocate( FillinMask( Nbytes ) )

    do i = 1,N
      if (T(i)%n_ancestor<2) cycle
      Start = i/bit + 1
      ChildMask(Start:Nbytes) = 0
      ParentMask(Start:Nbytes) = 0

      parent = T(i)%ancestor(1)
      range = 0
      do j = 2, T(i)%n_ancestor
        in = T(i)%ancestor(j)
        End = in/bit + 1
        ChildMask(End) = ibset(ChildMask(End),mod(in,bit))
        range = in
      enddo
      k = T(parent)%n_ancestor
      do j = 1, k
        in = T(parent)%ancestor(j)
        End = in/bit + 1
        ParentMask(End) = ibset(ParentMask(End),mod(in,bit))
        range = max(range,in)
      enddo
      End = range/bit + 1

      FillinMask(Start:End) = ior(  ChildMask(Start:End), ParentMask(Start:End)  )

      c = 0
      do j = Start, End
        c = c + popcnt(FillinMask(j))
      enddo

      if (c > 0) then
        allocate( Array(c) )
        T( parent )%n_ancestor=c
        in=0
        do j = Start, End
          do k = 1, popcnt(FillinMask(j))
            in = in + 1
            c = popcnt( iand(FillinMask(j), - FillinMask(j)) -1 )
            FillinMask(j) = ibclr(FillinMask(j),c)
            Array(in) = bit*(j-1)+c
          enddo
        enddo
        deallocate( T(parent)%ancestor)
        T(parent)%ancestor => Array
      end if
      !j=N/20
      !if (j==0) j=1
      !if ( mod(i,j)==0 ) write(*,"(i0,a,i0,i5,a)") i,"/",N, 100*i/N,"% OK "

    enddo
    deallocate( ChildMask )
    deallocate( ParentMask )
    deallocate( FillinMask )

    !Descendantを作る．
    do i = 1,N
      do j = 1, T(i)%n_ancestor
        T( T(i)%ancestor(j) )%n_descendant = T( T(i)%ancestor(j) )%n_descendant + 1
      enddo
    enddo
    do i = 1,N
      allocate(T(i)%descendant(T(i)%n_descendant))
    enddo

    allocate( diff(N) )
    diff(:)=0
    do i = 1,N
      do j = 1, T(i)%n_ancestor
        diff( T(i)%ancestor(j) ) = diff( T(i)%ancestor(j) ) + 1
        T( T(i)%ancestor(j) )%descendant(diff( T(i)%ancestor(j) )) = i
      enddo
    enddo

    !count fill-in including diagonal entries
    allocate(idxU(0:N))
    in = 0
    idxU(0) = 0
    do i=1,N
      idxU(i) = idxU(i-1) + T(i)%n_ancestor + 1
      in = in + T(i)%n_ancestor + 1
    enddo
    NPU = in

    allocate(itemU(NPU))
    in = 0
    do i=1,N
      in = in + 1
      itemU(in) = i
      do j=1,T(i)%n_ancestor
        in = in + 1
        itemU(in) = T(i)%ancestor(j)
      enddo
    enddo

    !lower part
    allocate(count(N))
    count = 0

    allocate(idxL(0:N))
    idxL(0) = 0

    do i=1,idxU(N)
      in = itemU(i)
      count(in) = count(in) + 1
    enddo

    do i=1,N
      idxL(i) = idxL(i-1) + count(i)
    enddo

    NPL = NPU
    c = 1
    allocate(itemL(NPL))
    do i=1,N
      aa:do k=1,i
        jS = idxU(k-1) + 1
        jE = idxU(k)
        do j=jS,jE
          in = itemU(j)
          if(i < in) cycle aa
          if(i == in)then
            itemL(c) = k
            c = c + 1
            cycle aa
          endif
        enddo
      enddo aa
    enddo

    deallocate(count)
  end subroutine hecmw_matrix_get_fillin_bin_asym

  subroutine hecmw_matrix_get_fillin(hecMESH, hecMAT, idxU, itemU, NPU)
    use hecmw_util
    implicit none
    type(hecmwST_local_mesh) :: hecMESH
    type(hecmwST_matrix) :: hecMAT
    type(hecmw_fill),   pointer:: F(:)
    integer(kind=kint), pointer :: idxU(:)
    integer(kind=kint), pointer :: itemU(:)
    integer(kind=kint) :: N, NPU
    integer(kind=kint) :: i, j, k, iS, iE, jS, jE
    integer(kind=kint) :: in, jn ,kn, nn, FN
    integer(kind=kint) :: imax, imin
    integer(kind=kint), allocatable :: check(:), add(:), parent(:)

    N = hecMAT%N
    allocate(F(N))

    do i=1,N
      jS = hecMAT%indexU(i-1) + 1
      jE = hecMAT%indexU(i)
      in = jE - jS + 1
      F(i)%N = in
      allocate(F(i)%node(in))
      jn = 1
      do j=jS,jE
        in = hecMAT%itemU(j)
        F(i)%node(jn) = in
        jn = jn + 1
      enddo
    enddo

    !allocate(parent(N))
    !parent = 0
    do i=1,N
      !buget list
      in = F(i)%N
      imin = minval(F(i)%node)
      imax = maxval(F(i)%node)
      allocate(check(imin:imax))
      check(imin:imax) = 0
      do j=1,in
        check(F(i)%node(j)) = 1
      enddo
      nn = 0
      do j=imin,imax
        if(check(j)==1) nn = nn + 1
      enddo
      deallocate(F(i)%node)
      F(i)%N = nn
      allocate(F(i)%node(nn))
      in = 1
      do j=imin,imax
        if(check(j)==1)then
          F(i)%node(in) = j
          in = in + 1
        endif
      enddo
      deallocate(check)

      !add fill-in
      FN = F(i)%N
      if(1<FN)then
        in = F(i)%node(1)
        nn = FN - 1
        allocate(add(nn))
        add(1:nn) = F(i)%node(2:FN)
        call reallocate_array(F(in), nn, add)
        deallocate(add)
      endif
    enddo

    !count fill-in including diagonal entries
    allocate(idxU(0:N))
    in = 0
    idxU(0) = 0
    do i=1,N
      idxU(i) = idxU(i-1) + F(i)%N + 1
      in = in + F(i)%N + 1
    enddo
    NPU = in

    allocate(itemU(NPU))
    in = 0
    do i=1,N
      in = in + 1
      itemU(in) = i
      do j=1,F(i)%N
        in = in + 1
        itemU(in) = F(i)%node(j)
      enddo
    enddo
  end subroutine hecmw_matrix_get_fillin

  subroutine hecmw_matrix_copy_with_fillin(hecMESH, hecT, idxU, itemU, AU, NPU)
    use hecmw_util
    implicit none
    type(hecmwST_local_mesh) :: hecMESH
    type(hecmwST_matrix) :: hecT
    integer(kind=kint), pointer :: idxU(:)
    integer(kind=kint), pointer :: itemU(:)
    real(kind=kreal), pointer :: AU(:)
    integer(kind=kint) :: N, NPU
    integer(kind=kint) :: i, j, k, iS, iE, jS, jE
    integer(kind=kint) :: in, jn ,kn, nn

    N   = hecT%N

    !value
    allocate(AU(9*NPU))
    AU = 0.0d0

    do k=1,N
      in = idxU(k-1)+1
      AU(9*in-8) = hecT%D(9*k-8)
      AU(9*in-7) = hecT%D(9*k-7)
      AU(9*in-6) = hecT%D(9*k-6)
      AU(9*in-5) = hecT%D(9*k-5)
      AU(9*in-4) = hecT%D(9*k-4)
      AU(9*in-3) = hecT%D(9*k-3)
      AU(9*in-2) = hecT%D(9*k-2)
      AU(9*in-1) = hecT%D(9*k-1)
      AU(9*in  ) = hecT%D(9*k  )
    enddo

    do k=1,N
      iS = idxU(k-1) + 1
      iE = idxU(k)
      jS = hecT%indexU(k-1) + 1
      jE = hecT%indexU(k)
      aa:do j=jS,jE
        jn = hecT%itemU(j)
        do i=iS,iE
          in = itemU(i)
          if(jn == in)then
            AU(9*i-8) = hecT%AU(9*j-8)
            AU(9*i-7) = hecT%AU(9*j-7)
            AU(9*i-6) = hecT%AU(9*j-6)
            AU(9*i-5) = hecT%AU(9*j-5)
            AU(9*i-4) = hecT%AU(9*j-4)
            AU(9*i-3) = hecT%AU(9*j-3)
            AU(9*i-2) = hecT%AU(9*j-2)
            AU(9*i-1) = hecT%AU(9*j-1)
            AU(9*i  ) = hecT%AU(9*j  )
            iS = iS + 1
            cycle aa
          endif
        enddo
      enddo aa
    enddo
  end subroutine hecmw_matrix_copy_with_fillin

  subroutine hecmw_matrix_copy_with_fillin_asym_33(hecMESH, hecT, idxU, itemU, AU, NPU, idxL, itemL, AL, NPL)
    use hecmw_util
    implicit none
    type(hecmwST_local_mesh) :: hecMESH
    type(hecmwST_matrix) :: hecT
    integer(kind=kint), pointer :: idxU(:)
    integer(kind=kint), pointer :: itemU(:)
    integer(kind=kint), pointer :: idxL(:)
    integer(kind=kint), pointer :: itemL(:)
    real(kind=kreal), pointer :: AU(:)
    real(kind=kreal), pointer :: AL(:)
    integer(kind=kint) :: N, NPU, NPL
    integer(kind=kint) :: i, j, k, iS, iE, jS, jE
    integer(kind=kint) :: in, jn ,kn, nn

    N   = hecT%N

    !value
    allocate(AU(9*NPU))
    allocate(AL(9*NPL))
    AU = 0.0d0
    AL = 0.0d0

    do k=1,N
      in = idxU(k-1)+1
      AU(9*in-8) = hecT%D(9*k-8)
      AU(9*in-7) = hecT%D(9*k-7)
      AU(9*in-6) = hecT%D(9*k-6)
      AU(9*in-5) = hecT%D(9*k-5)
      AU(9*in-4) = hecT%D(9*k-4)
      AU(9*in-3) = hecT%D(9*k-3)
      AU(9*in-2) = hecT%D(9*k-2)
      AU(9*in-1) = hecT%D(9*k-1)
      AU(9*in  ) = hecT%D(9*k  )
    enddo

    do k=1,N
      iS = idxU(k-1) + 1
      iE = idxU(k)
      jS = hecT%indexU(k-1) + 1
      jE = hecT%indexU(k)
      aa:do j=jS,jE
        jn = hecT%itemU(j)
        do i=iS,iE
          in = itemU(i)
          if(jn == in)then
            AU(9*i-8) = hecT%AU(9*j-8)
            AU(9*i-7) = hecT%AU(9*j-7)
            AU(9*i-6) = hecT%AU(9*j-6)
            AU(9*i-5) = hecT%AU(9*j-5)
            AU(9*i-4) = hecT%AU(9*j-4)
            AU(9*i-3) = hecT%AU(9*j-3)
            AU(9*i-2) = hecT%AU(9*j-2)
            AU(9*i-1) = hecT%AU(9*j-1)
            AU(9*i  ) = hecT%AU(9*j  )
            iS = iS + 1
            cycle aa
          endif
        enddo
      enddo aa
    enddo

    do k=1,N
      iS = idxL(k-1) + 1
      iE = idxL(k)
      jS = hecT%indexL(k-1) + 1
      jE = hecT%indexL(k)
      bb:do j=jS,jE
        jn = hecT%itemL(j)
        do i=iS,iE
          in = itemL(i)
          if(jn == in)then
            AL(9*i-8) = hecT%AL(9*j-8)
            AL(9*i-7) = hecT%AL(9*j-7)
            AL(9*i-6) = hecT%AL(9*j-6)
            AL(9*i-5) = hecT%AL(9*j-5)
            AL(9*i-4) = hecT%AL(9*j-4)
            AL(9*i-3) = hecT%AL(9*j-3)
            AL(9*i-2) = hecT%AL(9*j-2)
            AL(9*i-1) = hecT%AL(9*j-1)
            AL(9*i  ) = hecT%AL(9*j  )
            iS = iS + 1
            cycle bb
          endif
        enddo
      enddo bb
    enddo

  end subroutine hecmw_matrix_copy_with_fillin_asym_33

  subroutine hecmw_matrix_copy_with_fillin_asym_44(hecMESH, hecT, idxU, itemU, AU, NPU, idxL, itemL, AL, NPL)
    use hecmw_util
    implicit none
    type(hecmwST_local_mesh) :: hecMESH
    type(hecmwST_matrix) :: hecT
    integer(kind=kint), pointer :: idxU(:)
    integer(kind=kint), pointer :: itemU(:)
    integer(kind=kint), pointer :: idxL(:)
    integer(kind=kint), pointer :: itemL(:)
    real(kind=kreal), pointer :: AU(:)
    real(kind=kreal), pointer :: AL(:)
    integer(kind=kint) :: N, NPU, NPL
    integer(kind=kint) :: i, j, k, iS, iE, jS, jE
    integer(kind=kint) :: in, jn ,kn, nn

    N   = hecT%N

    !value
    allocate(AU(16*NPU))
    allocate(AL(16*NPL))
    AU = 0.0d0
    AL = 0.0d0

    do k=1,N
      in = idxU(k-1)+1
      AU(16*in-15) = hecT%D(16*k-15)
      AU(16*in-14) = hecT%D(16*k-14)
      AU(16*in-13) = hecT%D(16*k-13)
      AU(16*in-12) = hecT%D(16*k-12)
      AU(16*in-11) = hecT%D(16*k-11)
      AU(16*in-10) = hecT%D(16*k-10)
      AU(16*in- 9) = hecT%D(16*k- 9)
      AU(16*in- 8) = hecT%D(16*k- 8)
      AU(16*in- 7) = hecT%D(16*k- 7)
      AU(16*in- 6) = hecT%D(16*k- 6)
      AU(16*in- 5) = hecT%D(16*k- 5)
      AU(16*in- 4) = hecT%D(16*k- 4)
      AU(16*in- 3) = hecT%D(16*k- 3)
      AU(16*in- 2) = hecT%D(16*k- 2)
      AU(16*in- 1) = hecT%D(16*k- 1)
      AU(16*in   ) = hecT%D(16*k   )
    enddo

    do k=1,N
      iS = idxU(k-1) + 1
      iE = idxU(k)
      jS = hecT%indexU(k-1) + 1
      jE = hecT%indexU(k)
      aa:do j=jS,jE
        jn = hecT%itemU(j)
        do i=iS,iE
          in = itemU(i)
          if(jn == in)then
            AU(16*i-15) = hecT%AU(16*j-15)
            AU(16*i-14) = hecT%AU(16*j-14)
            AU(16*i-13) = hecT%AU(16*j-13)
            AU(16*i-12) = hecT%AU(16*j-12)
            AU(16*i-11) = hecT%AU(16*j-11)
            AU(16*i-10) = hecT%AU(16*j-10)
            AU(16*i- 9) = hecT%AU(16*j- 9)
            AU(16*i- 8) = hecT%AU(16*j- 8)
            AU(16*i- 7) = hecT%AU(16*j- 7)
            AU(16*i- 6) = hecT%AU(16*j- 6)
            AU(16*i- 5) = hecT%AU(16*j- 5)
            AU(16*i- 4) = hecT%AU(16*j- 4)
            AU(16*i- 3) = hecT%AU(16*j- 3)
            AU(16*i- 2) = hecT%AU(16*j- 2)
            AU(16*i- 1) = hecT%AU(16*j- 1)
            AU(16*i   ) = hecT%AU(16*j   )
            iS = iS + 1
            cycle aa
          endif
        enddo
      enddo aa
    enddo

    do k=1,N
      iS = idxL(k-1) + 1
      iE = idxL(k)
      jS = hecT%indexL(k-1) + 1
      jE = hecT%indexL(k)
      bb:do j=jS,jE
        jn = hecT%itemL(j)
        do i=iS,iE
          in = itemL(i)
          if(jn == in)then
            AL(16*i-15) = hecT%AL(16*j-15)
            AL(16*i-14) = hecT%AL(16*j-14)
            AL(16*i-13) = hecT%AL(16*j-13)
            AL(16*i-12) = hecT%AL(16*j-12)
            AL(16*i-11) = hecT%AL(16*j-11)
            AL(16*i-10) = hecT%AL(16*j-10)
            AL(16*i- 9) = hecT%AL(16*j- 9)
            AL(16*i- 8) = hecT%AL(16*j- 8)
            AL(16*i- 7) = hecT%AL(16*j- 7)
            AL(16*i- 6) = hecT%AL(16*j- 6)
            AL(16*i- 5) = hecT%AL(16*j- 5)
            AL(16*i- 4) = hecT%AL(16*j- 4)
            AL(16*i- 3) = hecT%AL(16*j- 3)
            AL(16*i- 2) = hecT%AL(16*j- 2)
            AL(16*i- 1) = hecT%AL(16*j- 1)
            AL(16*i   ) = hecT%AL(16*j   )
            iS = iS + 1
            cycle bb
          endif
        enddo
      enddo bb
    enddo

  end subroutine hecmw_matrix_copy_with_fillin_asym_44

  subroutine reallocate_array(F, num, x)
    implicit none
    type(hecmw_fill) :: F
    integer(kind=kint) :: n, i, num, x(:)
    integer(kind=kint), pointer :: tmp(:)

    n = F%N
    tmp => F%node
    F%node => null()
    allocate(F%node(n+num))
    do i=1,n
      F%node(i) = tmp(i)
    enddo
    do i=1,num
      F%node(n+i) = x(i)
    enddo
    F%N = F%N + num
    deallocate(tmp)
  end subroutine reallocate_array

end module m_hecmw_matrix_get_fillin
