#' @importFrom data.table :=
#' @importFrom dplyr between bind_cols bind_rows filter left_join rename select slice_sample
#' @importFrom magrittr %>%
#' @importFrom rlang sym !!
NULL

#' Create many-to-one pairs, when there are existing households
#'
#' Creates a data frame of many-to-one pairs, based on a distribution of age differences. Designed to match multiple children to the same parent, the function can be used for any situation where a many-to-one match is required based on a range of age differences. For clarity and brevity, the terms "children" and "parents" will be used.
#' Two data frames are required: one for children and one for potential parents. The data frame of potential parents must contain household identifiers
#' The minimum and maximum ages of parents must be specified. This ensures that there are no parents who were too young (e.g. 11 years) or too old (e.g. 70 years) at the time the child was born. The presence of too young and too old parents is tested throughout this function. Thus, pre-cleaning the parents data frame is not required.
#' Both data frames must be restricted to only those people that will be paired.
#'
#' @export
#' @param children The data frame containing the children to be paired with a parent/guardian.
#' @param chlid The variable containing the unique ID for each person,in the children data frame.
#' @param chlage The age variable, in the children data frame.
#' @param numchild The number of children that are required in each household.
#' @param twinprob The probability that a person is a twin.
#' @param parents The data frame containing the potential parents.(This data frame must contain at least the same number of observations as the children data frame.)
#' @param parid The variable containing the unique ID for each person,in the parents data frame.
#' @param parage The age variable, in the parent data frame.
#' @param minparage The youngest age at which a person becomes a parent. The default value is NULL, which will cause the function to stop.
#' @param maxparage The oldest age at which a person becomes a parent. The default value is NULL, which will cause the function to stop.
#' @param HHNumVar The name of the household identifier variable in the parents data frame.
#' @param userseed If specified, this will set the seed to the number provided. If not, the normal set.seed() function will be used.
#' @param maxdiff The maximum age difference for the children in a household ages. This is applied to the first child randomly selected for the household, so overall age differences may be 2* maxdiff. Default value is no constraints on child age differences in the household.
#'
#' @return A list of three  data frames. $Matched contains the data frame of child-parent matches. $Adults contains any unmatched observations from the parents data frame. $Children contains any unmatched observations from the children data frame. $Adults and/or $Children may be empty data frames.
#'
#' @examples
#'
#' library(dplyr)
#'
#' set.seed(1)
#' Parents <- Township %>%
#'   filter(Relationship == "Partnered", Age > 18) %>%
#'   slice_sample(n = 500) %>%
#'   mutate(Household = row_number())
#' Children <- Township %>%
#'   filter(Relationship == "NonPartnered", Age < 20) %>%
#'   slice_sample(n = 400)
#'
#' # example with assigning two children to a parent
#' # the same number of children is assigned to all parents
#' # adding two children to each parent
#'
#' ChildMatched <- pairmultNum(Children, chlid = "ID", chlage = "Age", numchild = 2, twinprob = 0.03,
#'                             Parents, parid = "ID", parage = "Age", minparage = 18, maxparage = 54,
#'                             HHNumVar = "Household", userseed =4, maxdiff = 3)
#' MatchedFamilies <- ChildMatched$Matched
#' UnmatchedChildren <- ChildMatched$Children
#' UnmatchedAdults <- ChildMatched$Adults

pairmultNum <- function(children, chlid, chlage, numchild = 2, twinprob = 0, parents, parid, parage, minparage = NULL,
                        maxparage = NULL, HHNumVar = NULL, userseed=NULL, maxdiff=1000)

{

  withr::local_options(dplyr.summarise.inform = FALSE)

  # content check
  # child dataframe
  if (!chlid %in% names(children)) {
    stop("The ID variable in the children data frame does not exist.")
  }

  if (!chlage %in% names(children)) {
    stop("The age variable in the children data frame does not exist.")
  }

  # parent dataframe
  if (!parid %in% names(parents)) {
    stop("The ID variable in the parents data frame does not exist.")
  }

  if (!parage %in% names(parents)) {
    stop("The age variable in the parents data frame does not exist.")
  }

  if (is.null(minparage)) {
    stop("The minimum parent age must be supplied.")
  }

  if (is.null(maxparage)) {
    stop("The maximum parent age must be supplied.")
  }

  if(is.null(HHNumVar)) {
    stop("A name for the household count variable must be supplied.")
  }

  #####################################
  #####################################
  # get column names as symbols to use inside data frame subfunctions
  #####################################
  #####################################
  # smalldf ID variable
  chidcolName <- sym(names(children[chlid]))
  # smalldf age variable
  chagecolName <- sym(names(children[chlage]))

  # largedf ID variable
  paridcolName <- sym(names(parents[parid]))
  # largedf age variable
  paragecolName <- sym(names(parents[parage]))

  #####################################
  #####################################
  # end column names
  #####################################
  #####################################

  # more testing

  if (!any(duplicated(children[[chidcolName]])) == FALSE) {
    stop("The ID variable in the children data frame has duplicated values.")
  }

  if (!any(duplicated(parents[[paridcolName]])) == FALSE) {
    stop("The ID variable in the parents data frame has duplicated values.")
  }

  if (!is.numeric(children[[chagecolName]])) {
    stop("The age variable in the children data frame is not numeric.")
  }

  if (!is.numeric(parents[[paragecolName]])) {
    stop("The age variable in the parents data frame is not numeric.")
  }



  # create the internal data frames
  childrenRenamed <- children %>%
    rename(ChildID = !! chidcolName,
           ChildAge = !! chagecolName)

  parentsRenamed <- parents %>%
    rename(ParentID = !! paridcolName,
           ParentAge = !! paragecolName,
           internalHHID = !! HHNumVar) %>%
    ungroup()

  # give info on expected number of twin households

  ExpctNumHHTwins <- round(nrow(childrenRenamed)*twinprob)
  ExpctTotalHH <- nrow(childrenRenamed) / numchild

  # sort this out if the parent data frame is much smaller

  if(nrow(parentsRenamed) < ExpctTotalHH) {

    ExpctProbPerHH <- ExpctNumHHTwins/ ExpctTotalHH
    ExpctNumHH <- round(ExpctProbPerHH * ExpctTotalHH)

  } else {

    ExpctNumHH <- ExpctNumHHTwins

  }

  ExpMaxHH <- nrow(parentsRenamed)
  ExpNumParTwins <- ExpMaxHH * (twinprob/2)

  if(ExpctNumHH > ExpNumParTwins) {

    ExpctNumHH <- round(ExpNumParTwins,0)
  }

  # seed must come before first sample is cut
  if (!is.null(userseed)) {
    set.seed(userseed)
  }


  #####################################
  #####################################
  # end set up
  #####################################
  #####################################

  #####################################
  #####################################
  # Functions for twins and their siblings
  #####################################
  #####################################


  #####################################
  #####################################
  # Split into twins and non-twins
  #####################################
  #####################################

  if (ExpctNumHH > 0 & numchild == 2) {

    # restrict child dataframe to those of which there are at least two children, cannot twin if there isn't a second child
    # make sure the minimum and maximum ages have enough other child ages to meet the numkids and maxdiff requirements

    AgesForTwins <- childrenRenamed %>%
      group_by(.data$ChildAge) %>%
      summarise(NumAge = n()) %>%
      filter(.data$NumAge > 1)

    # select twin ages

    for (i in 1:ExpctNumHH) {


      # get age, need to select randomly

      TwinAgeSelect <- AgesForTwins %>%
        slice_sample(n=1, weight_by = .data$NumAge)

      CurrentAge <- TwinAgeSelect$ChildAge

      # pull children from the child dataframe who are that age
      SelectedKids <- childrenRenamed %>%
        filter(.data$ChildAge==CurrentAge) %>%
        slice_sample(n=2, replace = FALSE)

      # get parent
      NeededMin <- minparage + CurrentAge
      NeededMax <- maxparage + CurrentAge


      SelectedParent <- parentsRenamed %>%
        filter(between(.data$ParentAge, NeededMin, NeededMax)) %>%
        slice_sample(n=1)

      currentHHID <- SelectedParent$internalHHID


      SelectedKids <- SelectedKids %>%
        mutate(internalHHID = currentHHID)

      # add these to a file that will continue to be appended
      # make sure that the household ID is added.
      # separate the parent and child files at this point as the column names are different

      if(exists("ChildAgeMatch")) {

        ChildAgeMatch <- bind_rows(ChildAgeMatch, SelectedKids)
        ParentAgeMatch <- bind_rows(ParentAgeMatch, SelectedParent)

      } else {

        ChildAgeMatch <- SelectedKids
        ParentAgeMatch <- SelectedParent


      }

      # remove the children from the available children and update the counts to enter into the loop
      childrenRenamed <- childrenRenamed %>%
        filter(!(.data$ChildID %in% c(SelectedKids$ChildID)))

      AgesForTwins <- childrenRenamed %>%
        group_by(.data$ChildAge) %>%
        summarise(NumAge = n()) %>%
        filter(.data$NumAge > 1)

      # remove the parent

      parentsRenamed <- parentsRenamed %>%
        filter(!(.data$ParentID %in% c(SelectedParent$ParentID)))


      # have to add the additional twin stuff in here, need to check if children needed > 0

      # closes for (i in 1:ExpctNumHH) {
    }

    # closes if (ExpctNumHH > 0 & numchild == 2) {
  }


  #####################################
  #####################################
  # Loop if twin rate > 0 and numkids > 2
  #####################################
  #####################################

  if (ExpctNumHH > 0 & numchild > 2) {

    # restrict child dataframe to those of which there are at least two children, cannot twin if there isn't a second child

    NumAddtionalKids <- numchild - 2

    MinChildrenRenamedAge <- min(childrenRenamed$ChildAge)
    MaxChildrenRenamedAge <- max(childrenRenamed$ChildAge)

    AgesForTwins <- childrenRenamed %>%
      group_by(.data$ChildAge) %>%
      summarise(NumAge = n()) %>%
      filter(.data$NumAge > 1)


    # select twin ages

    for (i in 1:ExpctNumHH) {


      # get age, need to select randomly

      # TODO loop to make sure that twin age is reasonable given number of kids needed

      TwinAgeSelect <- AgesForTwins %>%
        slice_sample(n=1, weight_by = .data$NumAge)

      CurrentAge <- TwinAgeSelect$ChildAge


      # pull children from the child dataframe who are that age
      SelectedKids <- childrenRenamed %>%
        filter(.data$ChildAge==CurrentAge) %>%
        slice_sample(n=2, replace = FALSE)

      # get parent
      NeededMin <- minparage + CurrentAge
      NeededMax <- maxparage + CurrentAge

      SelectedParent <- parentsRenamed %>%
        filter(between(.data$ParentAge, NeededMin, NeededMax)) %>%
        slice_sample(n=1)

      currentHHID <- SelectedParent$internalHHID


      SelectedKids <- SelectedKids %>%
        mutate(internalHHID = currentHHID)

      # find the kids that also need to be added


      # find another n = NumAdditionalKids children

      childrenRenamed <- childrenRenamed %>%
        filter(!(.data$ChildID %in% c(SelectedKids$ChildID)))

      # filter out children same age, plus children too old or young to be added

      AddChildMinAge <- max(0, CurrentAge-maxdiff)
      AddChildMaxAge <- CurrentAge+maxdiff

      # shortlist based on the age of the twins
      AgesForExtraChildren <- childrenRenamed %>%
        group_by(.data$ChildAge) %>%
        filter(!(.data$ChildAge==CurrentAge),
               between(.data$ChildAge, AddChildMinAge, AddChildMaxAge))

      # short list based on the age of the parent
      # need the possible child age range for the selected parent
      # looks weird, but the smallest number is when you deduct the largest age gap
      # and the largest number is when you deduct the smallest age gap
      MinChildAgeToAdd <- SelectedParent$ParentAge - maxparage
      MaxChildAgeToAdd <- SelectedParent$ParentAge - minparage



      AgesForExtraChildren <- AgesForExtraChildren %>%
        filter(between(.data$ChildAge, MinChildAgeToAdd, MaxChildAgeToAdd)) %>%
        group_by(.data$ChildAge) %>%
        summarise(NumAge = n())

      # sample NumAddtionalKids rows

      NumExtraAges <- nrow(AgesForExtraChildren)


      if(NumExtraAges >= NumAddtionalKids) {


        ChildAgesChosen <- AgesForExtraChildren %>%
          slice_sample(n= NumAddtionalKids, replace = FALSE)


        # randomly select a child of each age in the ChildAgesChosen data frame
        for(m in 1:NumAddtionalKids) {

          NewChildToAdd <- ChildAgesChosen[m,]
          NewChildAge <- NewChildToAdd$ChildAge

          NewChild <- childrenRenamed %>%
            filter(.data$ChildAge == NewChildAge) %>%
            slice_sample(n=1) %>%
            mutate(internalHHID = currentHHID)


          if(exists("DataframeAddedKids")) {

            DataframeAddedKids <- bind_rows(DataframeAddedKids, NewChild)

            # closes if(exists("DataframeAddedKids")) {
          } else {

            DataframeAddedKids <- NewChild

            # closes else to if(exists("DataframeAddedKids")) {
          }

          # closes for(m in 1:NumAddtionalKids) {
        }


        # randomly select the additional children


        # closes loop for if(NumExtraAges >= NumAddtionalKids) {

        SelectedKids <- bind_rows(SelectedKids, DataframeAddedKids)

        rm(DataframeAddedKids)

        # add these to a file that will continue to be appended
        # make sure that the household ID is added.
        # separate the parent and child files at this point as the column names are different

        if(exists("ChildAgeMatch")) {

          ChildAgeMatch <- bind_rows(ChildAgeMatch, SelectedKids)
          ParentAgeMatch <- bind_rows(ParentAgeMatch, SelectedParent)


        } else {

          ChildAgeMatch <- SelectedKids
          ParentAgeMatch <- SelectedParent



        }

        # remove the children from the available children and update the counts to enter into the loop
        childrenRenamed <- childrenRenamed %>%
          filter(!(.data$ChildID %in% c(SelectedKids$ChildID)))

        AgesForTwins <- childrenRenamed %>%
          group_by(.data$ChildAge) %>%
          summarise(NumAge = n()) %>%
          filter(.data$NumAge > 1)


        # remove the parent

        parentsRenamed <- parentsRenamed %>%
          filter(!(.data$ParentID %in% c(SelectedParent$ParentID)))


      } else {

        # NOT ENOUGH KIDS TO ADD, MUST REJECT THE TWIN SELECTION
        # TODO if this becomes a problem, but twins are matched first
        # not sure if this is going to work

        stop("Twins have no matching siblings \n")
#
#         # remove problem twin age
#         AgesForTwins <- childrenRenamed %>%
#           filter(.data$ChildAge == CurrentAge)
#         group_by(.data$ChildAge) %>%
#           summarise(NumAge = n()) %>%
#           filter(.data$NumAge > 1)
#
#         # add selected children back into the childrenRenamed dataframe
#         # who were removed
#         SelectedKids <- SelectedKids %>%
#           select(- "internalHHID")
#
#         childrenRenamed <- bind_rows(childrenRenamed, SelectedKids)

        # closes else to for if(NumExtraAges >= NumAddtionalKids) {
      }

      # have to add the additional twin stuff in here, need to check if children needed > 0

      # closes for (i in 1:ExpctNumHH) {
    }

    # closes if (ExpctNumHH > 0 & numchild > 2) {

  }






  #####################################
  #####################################
  # Create the non-twin families
  #####################################
  #####################################
  # work through the child dataset as these are the ones that need to be used

  # may not be a modulo, so check before each iteration
  # just need to see that the number of people available in the child dataset is >= number needed
  # Counter = 0

  # MaxCounter = 10*(nrow(childrenRenamed))

  while(nrow(childrenRenamed) >= numchild & nrow(parentsRenamed) > 0) {


    # grab a child randomly
    SelectedFirstChild <- childrenRenamed %>%
      slice_sample(n=1, replace = FALSE)

    CurrentAge <- SelectedFirstChild$ChildAge


    # get parent
    NeededMin <- minparage + CurrentAge
    NeededMax <- maxparage + CurrentAge


    SelectedParent <- parentsRenamed %>%
      filter(between(.data$ParentAge, NeededMin, NeededMax)) %>%
      slice_sample(n=1)

    # only do matching if nrow(SelectedParent) > 0
    # if it is zero, no parent selected


    if(nrow(SelectedParent) > 0) {

      currentHHID <- SelectedParent$internalHHID

      # get kids by creating a dataframe of ages available


      MinChildAge <- SelectedParent$ParentAge - maxparage
      MaxChildAge <- SelectedParent$ParentAge - minparage


      SelectedNextChildAges <- childrenRenamed %>%
        filter(!(.data$ChildID == SelectedFirstChild$ChildID),
               !(.data$ChildAge == SelectedFirstChild$ChildAge),
               between(.data$ChildAge, MinChildAge, MaxChildAge)) %>%
        group_by(.data$ChildAge) %>%
        summarise(NumAge = n()) %>%
        filter(.data$NumAge > 0)


      # test if there needs to be a distribution restriction on the child ages
      # NULL is the default, no constraints

      if(is.null(maxdiff)) {

        #####################################
        # only sample if nrow(SelectedNextChildAges) has at least as many rows as numchild-1
        # otherwise there aren't enough kids to sample
        # sample without replacement
        #####################################

        if(nrow(SelectedNextChildAges) >= (numchild - 1)) {

          # remove selected child from children dataframe
          childrenRenamed <- childrenRenamed %>%
            filter(!(.data$ChildID %in% c(SelectedFirstChild$ChildID)))


          # just sample ages with no restriction
          SampledChildAges <- SelectedNextChildAges %>%
            slice_sample(n=(numchild-1), weight_by = .data$NumAge, replace = FALSE)

          # get child

          for(l in 1:nrow(SampledChildAges)) {

            SampledAgeRow <- SampledChildAges[l,]

            SampledAge <- SampledAgeRow$ChildAge

            SelectedKids <- childrenRenamed %>%
              filter(.data$ChildAge == SampledAge) %>%
              slice_sample(n=1) %>%
              mutate(internalHHID = currentHHID)

            # remove from childrenRenamed

            childrenRenamed <- childrenRenamed %>%
              filter(!(.data$ChildID %in% c(SelectedKids$ChildID)))

            # create an interim dataset of children to append to working dataset

            if(exists("ChildAgeMatch")) {

              ChildAgeMatch <- bind_rows(ChildAgeMatch, SelectedKids)
              ParentAgeMatch <- bind_rows(ParentAgeMatch, SelectedParent)


              parentsRenamed <- parentsRenamed %>%
                filter(!(.data$ParentID %in% c(SelectedParent$ParentID)))

              # closes if(exists("ChildAgeMatch")) {
            } else {

              ChildAgeMatch <- SelectedKids
              ParentAgeMatch <- SelectedParent


              parentsRenamed <- parentsRenamed %>%
                filter(!(.data$ParentID %in% c(SelectedParent$ParentID)))

              # closes else to if(exists("ChildAgeMatch")) {
            }

            # closes for(l in 1:nrow(SampledChildAges)) {
          }

          # bind first selected child
          # append first child onto the selected kids

          SelectedFirstChild <- SelectedFirstChild %>%
            mutate(internalHHID = currentHHID)

          ChildAgeMatch <- bind_rows(ChildAgeMatch, SelectedFirstChild)




          # closes if(nrow(SelectedNextChildAges) > numchild-1) {
        } else {

          #####################################
          # need to remove unmatchable child from the childrenRenamed data frame
          #####################################

          childrenRenamed <- childrenRenamed %>%
            filter(!(.data$ChildID %in% c(SelectedFirstChild$ChildID)))

          # closes else to if(nrow(SelectedNextChildAges) > numchild-1) {
        }



        # closes if(isNULL(maxdiff)) {
      } else {

        #####################################
        # used if there is a standard deviation for the child ages
        # only sample if nrow(SelectedNextChildAges) has at least as many rows as numchild-1
        # otherwise there aren't enough kids to sample
        # sample without replacement
        #####################################

        # further filter the selected children based on the maxdiff value

        # limit extra children to those permitted by the maxdiff value
        NeededMinExtra <- CurrentAge - maxdiff
        NeededMaxExtra <- CurrentAge + maxdiff

        SelectedNextChildAges <- SelectedNextChildAges %>%
          filter(between(.data$ChildAge, NeededMinExtra, NeededMaxExtra))



        if(nrow(SelectedNextChildAges) >= (numchild - 1)) {

          # remove selected child from children dataframe
          childrenRenamed <- childrenRenamed %>%
            filter(!(.data$ChildID %in% c(SelectedFirstChild$ChildID)))

          # just sample ages with no restriction
          SampledChildAges <- SelectedNextChildAges %>%
            slice_sample(n=(numchild-1), weight_by = .data$NumAge, replace = FALSE)

          # get child

          for(l in 1:nrow(SampledChildAges)) {

            SampledAgeRow <- SampledChildAges[l,]

            SampledAge <- SampledAgeRow$ChildAge

            SelectedKids <- childrenRenamed %>%
              filter(.data$ChildAge == SampledAge) %>%
              slice_sample(n=1) %>%
              mutate(internalHHID = currentHHID)

            # remove from childrenRenamed

            childrenRenamed <- childrenRenamed %>%
              filter(!(.data$ChildID %in% c(SelectedKids$ChildID)))

            # create an interim dataset of children to append to working dataset

            if(exists("ChildAgeMatch")) {

              ChildAgeMatch <- bind_rows(ChildAgeMatch, SelectedKids)
              ParentAgeMatch <- bind_rows(ParentAgeMatch, SelectedParent)


              parentsRenamed <- parentsRenamed %>%
                filter(!(.data$ParentID %in% c(SelectedParent$ParentID)))

              # closes if(exists("ChildAgeMatch")) {
            } else {

              ChildAgeMatch <- SelectedKids
              ParentAgeMatch <- SelectedParent


              parentsRenamed <- parentsRenamed %>%
                filter(!(.data$ParentID %in% c(SelectedParent$ParentID)))

              # closes else to if(exists("ChildAgeMatch")) {
            }

            # closes for(l in 1:nrow(SampledChildAges)) {
          }

          # bind first selected child
          # append first child onto the selected kids

          SelectedFirstChild <- SelectedFirstChild %>%
            mutate(internalHHID = currentHHID)

          ChildAgeMatch <- bind_rows(ChildAgeMatch, SelectedFirstChild)


          # closes if(nrow(SelectedNextChildAges) > numchild-1) {
        } else {

          #####################################
          # need to remove unmatchable child from the childrenRenamed data frame


          childrenRenamed <- childrenRenamed %>%
            filter(!(.data$ChildID %in% c(SelectedFirstChild$ChildID)))

          # closes else to if(nrow(SelectedNextChildAges) > numchild-1) {
        }




        # closes else to if(is.null(maxdiff)) {
      }

      # closes if(nrow(SelectedParent) > 0)

    } else {
      # need an else in here
      # remove all ages of that child in the data frame
      # alternatively, remove child and see if that fixes it
      childrenRenamed <- childrenRenamed %>%
        filter(!(.data$ChildID %in% c(SelectedFirstChild$ChildID)))

    }

    # closes if(nrow(childrenRenamed) >= numchild)
  }

  #####################################
  #####################################
  # Create the output data frames
  #####################################
  #####################################


  FullMatchedChld <- ChildAgeMatch %>%
    rename(!! chidcolName := "ChildID",
           !! chagecolName := "ChildAge",
           {{HHNumVar}} := "internalHHID")


  FullMatchedPrnt <- ParentAgeMatch %>%
    rename(!! paridcolName := "ParentID",
           !! paragecolName := "ParentAge",
           {{HHNumVar}} := "internalHHID") %>%
    unique()


  OutputDataframe <- rbind(FullMatchedChld, FullMatchedPrnt)


  MatchedIDs <- OutputDataframe %>%
    pull({{paridcolName}})

  noParents <- children %>%
    filter(!({{chidcolName}} %in% MatchedIDs))


  noKids <- parents %>%
    filter(!({{paridcolName}} %in% MatchedIDs))

  MergedList <- list()

  MergedList$Matched <- OutputDataframe
  MergedList$Children <- noParents
  MergedList$Adults <- noKids

  return(MergedList)



  # closes function
}
