#' This function provides a fake sample data set which has the form of the German FSS data. It is important to verify before if the national RDC provides the data
#' in the form as desired by this function to have a proper fake data set.
#'
#' This function is written for the German Farm Structure Survey data as it was provided in the year 2021.This means, that the generated data will be in a form that fits
#' to the variables that are used from 2010 onwards - C0 codes.
#'
#' In its basic form this function generates data for the years 1999, 2003, 2007, 2010, 2013, 2016 and 2020. For 2013, it is only a sample of the population. The automatically
#' generated variables comprise 4 regional variables, 7 general variables and 7 production based variables.
#'
#' \emph{Regional variables}:
#' \itemize{
#'   \item C0010U1: NUTS1
#'   \item C0010UG5: NUTS2
#'   \item C0010UG4: NUTS3
#'   \item AGS: LAU
#' }
#' The regional variables are reasonable, but far away from correct numbers.
#'
#' \emph{General variables}:
#' \itemize{
#'  \item C0008U1: year of survey
#'  \item nr: farm id
#'  \item C0072: weighting factor - generated also for non-sample farms - only relevant for sample farms - weighted sum of a specific variable does not lead to the population sum!
#'  \item C0025: "N" population or "S" sample farm
#'  \item C0041: legal status - single farm, unincorporate farm (both as private farm) and corporate farm
#'  \item C0045: 1 full-time farm, 2 part-time farm, NA neither
#'  \item C0060UG1: farm type - aggregated to some relevant farm types in Germany
#' }
#' \emph{Production variables}:
#' \itemize{
#'  \item C0240: total utilized agricultural area
#'  \item C0231, C0232, C0233, C0234: grass land activities
#'  \item C0210: arable land
#'  }
#' These variables are coherent as grass land and arable land sum up to total land.
#'
#' Any additional variables provided via C0codes argument are not coherent to these production variables.
#'
#' @description
#' This function provides a fake sample data set which has the form of the German FSS data.
#'
#'@author Sebastian Neuenfeldt
#'
#'
#' @param nobs Number of observations approximately to be generated
#'
#' @param years Years of survey
#'
#' @param C0codes Optional variables to be generated, only meaningful for continues variables.
#'
#' @return
#' Retruns a fake data set based on German FSS data.
#'
#'@import
#'
#' @export
#'
#' @examples \dontrun{
#' FSS_data_DE <- generateFakeFSSData_DE()
#' }
generateFakeFSSData_DE <- function(nobs=270000,years=c(1999,2003,2007,2010,2013,2016,2020),C0codes=NULL){

  cat("\n Build fake data  \n")
  N <- nobs*length(years)
  if(is.null(C0codes)){
    data_df <- matrix(nrow = N,ncol = length(c("C0008U1","nr","AGS","C0010U1","C0010UG5","C0010UG4","C0072","C0025","C0060UG1")))
    data_df <- data.frame(data_df)
    colnames(data_df) <- c("C0008U1","nr","AGS","C0010U1","C0010UG5","C0010UG4","C0072","C0025","C0060UG1")
  } else {
    data_df <- matrix(nrow = N,ncol = length(c("C0008U1","nr","AGS","C0010U1","C0010UG5","C0010UG4","C0072","C0025","C0060UG1",
                                                             C0codes)))
    data_df <- data.frame(data_df)
    colnames(data_df) <- c("C0008U1","nr","AGS","C0010U1","C0010UG5","C0010UG4","C0072","C0025","C0060UG1",
                                       C0codes)
  }

  ## set years, farm id's, federal states (NUTS1), NUTS2, NUTS3, LAU, legal form, socio-economic type, a weighting factor (but for all farms)
  set.seed(1234)
  data_df$C0008U1 <- sample(years,size = N, replace = TRUE, prob = c(60,58,56,54,53,52,50)[1:length(years)])
  data_df$nr <- 1:N
  data_df$C0010U1 <- as.character(sample(c(1:16),size = N, replace = TRUE, prob = c(10,1,25,1,35,15,12,30,30,10,1,20,15,18,12,10)))
  data_df$C0010U1 <- ifelse(nchar(data_df$C0010U1)==1,paste0("0",data_df$C0010U1),data_df$C0010U1)
  data_df$C0010UG5 <- as.character(paste0(data_df$C0010U1,0))
  ## Adjust NUTS2 Baden-Württemberg (4), Bayern (7), Hessen (3) und Nordrhein-Westfalen (5)
  data_df$C0010UG5[data_df$C0010U1==5] <- as.character(paste0(data_df$C0010U1[data_df$C0010U1==5],#
                                                              sample(c(1,3,5,7,9),
                                                                     size = length(data_df$C0010U1[data_df$C0010U1==5]),
                                                                     replace = TRUE,
                                                                     prob = c(1,1,1,1,1))))
  data_df$C0010UG5[data_df$C0010U1==3] <- as.character(paste0(data_df$C0010U1[data_df$C0010U1==3],#
                                                              sample(c(1,2,3,4),
                                                                     size = length(data_df$C0010U1[data_df$C0010U1==3]),
                                                                     replace = TRUE,
                                                                     prob = c(1,1,1,1))))
  data_df$C0010UG5[data_df$C0010U1==6] <- as.character(paste0(data_df$C0010U1[data_df$C0010U1==6],#
                                                              sample(c(1,2,3),
                                                                     size = length(data_df$C0010U1[data_df$C0010U1==6]),
                                                                     replace = TRUE,
                                                                     prob = c(1,1,1))))
  data_df$C0010UG5[data_df$C0010U1==8] <- as.character(paste0(data_df$C0010U1[data_df$C0010U1==8],#
                                                              sample(c(1,2,3,4),
                                                                     size = length(data_df$C0010U1[data_df$C0010U1==8]),
                                                                     replace = TRUE,
                                                                     prob = c(1,1,1,1))))
  data_df$C0010UG5[data_df$C0010U1==9] <- as.character(paste0(data_df$C0010U1[data_df$C0010U1==9],#
                                                              sample(c(1,2,3,4,5,6,7),
                                                                     size = length(data_df$C0010U1[data_df$C0010U1==9]),
                                                                     replace = TRUE,
                                                                     prob = c(1,1,1,1,1,1,1))))
  ## Generate NUTS3 - which has not correct numbers
  data_df$C0010UG4 <- paste0(data_df$C0010UG5,sample(c(51:71),size = N, replace = TRUE))
  ## Generate AGS (LAU) - which has not correct numbers
  data_df$AGS <- paste0(data_df$C0010UG4,sample(c(101:131),size = N, replace = TRUE))

  data_df$C0041 <- sample(c(1:3),size = N, replace = TRUE, prob = c(8,2,1))
  data_df$C0045 <- sample(c(1,2,NA),size = N, replace = TRUE, prob = c(7,2,1))
  data_df$C0072 <- sample(seq(from=1,to=6,by=0.1),size=N,replace=TRUE)
  data_df$C0025 <- "S"
  head(data_df)

  ## Randomly select farms as belonging to the population ("N") or sample ("S") - there are additional variables provided for sample farms
  for(i in unique(data_df$C0008U1)){
    data_df[data_df$C0008U1==i,"C0025"] <- sample(c("N","S"),size = nrow(data_df[data_df$C0008U1==i,]),prob = c(9,3.6),replace = TRUE)
  }
  ## As in 2013 there are only sample farms, delete "N" entries from the data
  data_df <- data_df[!(data_df$C0025=="N" & data_df$C0008U1==2013),]

  ## Randomly give type of farming to farms
  for(i in unique(data_df$C0008U1)){
    data_df[data_df$C0008U1==i,"C0060UG1"] <- sample(c("15","16","2","35","36_38","45","46","47","48","51","52_53","6","7","83","84"),
                                                                        size = nrow(data_df[data_df$C0008U1==i,]),
                                                                        replace = TRUE, prob = c(20,11,2,3,4,16,9,8,7,11,7,8,7,8,7))
  }
  ## Aggregate type of farms to some main groups
  data_df$C0060UG1 <- ifelse(data_df$C0060UG1 %in% c("15","16"),"1",data_df$C0060UG1)
  data_df$C0060UG1 <- ifelse(data_df$C0060UG1 %in% c("2","35","36_38"),"2_3",data_df$C0060UG1)
  data_df$C0060UG1 <- ifelse(data_df$C0060UG1 %in% c("46","47","48"),"46_48",data_df$C0060UG1)
  data_df$C0060UG1 <- ifelse(data_df$C0060UG1 %in% c("51","52_53"),"5",data_df$C0060UG1)
  data_df$C0060UG1 <- ifelse(data_df$C0060UG1 %in% c("6","7","83","84"),"6_7_8",data_df$C0060UG1)

  ## Generate some import aggregate activities
  ##  - C0240: total utilized agricultural area
  ##  - C0231, C0232, C0233, C0234: grass land activities
  ##  - C0210: arable land
  data_df[,"C0240"] <- rnorm(n=nrow(data_df),mean=(70+rnorm(n=1,mean=3,sd=3)),sd=40)
  # data_df[,i] <- rcauchy(N, location=10.262, scale=15.102)
  data_df[,"C0240"] <- ifelse(data_df[,"C0240"]<0,0,data_df[,"C0240"])

  ## Grünland
  data_df[,"C0231"] <- data_df[,"C0240"]*0.3
  data_df[,"C0232"] <- data_df[,"C0240"]*0.1
  data_df[,"C0233"] <- data_df[,"C0240"]*0.05
  data_df[,"C0234"] <- data_df[,"C0240"]*0.02
  ## arable land - residual of C0240 and C0231-C0234
  data_df[,"C0210"] <- data_df[,"C0240"] - data_df[,"C0231"] - data_df[,"C0232"] - data_df[,"C0233"] - data_df[,"C0234"]

  ## Randomly generate hectares or herd size (does not matter for fake data) - not for total utilized agricultural area, arable land and grass land activities
  for(i in C0codes[!C0codes %in% c("C0240","C0210","C0231","C0232","C0233","C0234")]){
    if(is.null(i)){
      cat("No additional variables generated")
      next
    }
    data_df[,i] <- rnorm(n=nrow(data_df),mean=(10+rnorm(n=1,mean=3,sd=3)),sd=10)
    ## adjust negative values
    data_df[,i] <- ifelse(data_df[,i]<0,0,data_df[,i])
    ## adjust too large values
    data_df[,i] <- ifelse(data_df[,i]>3*sd(data_df[,i]),0,data_df[,i])
  }


  return(data_df)

}