#'
#'@import
#'
#' @export
#'
#' @examples \dontrun{
#' FSS_data_DE <- generateFakeFSSData_DE()
#' }
generateFakeFSSData_DE <- function(nobs=270000,years=c(1999,2003,2007,2010,2013,2016,2020),C0codes=NULL){
cat("\n Build fake data \n")
N <- nobs*length(years)
if(is.null(C0codes)){
data_df <- matrix(nrow = N,ncol = length(c("C0008U1","nr","AGS","C0010U1","C0010UG5","C0010UG4","C0072","C0025","C0060UG1")))
data_df <- data.frame(data_df)
colnames(data_df) <- c("C0008U1","nr","AGS","C0010U1","C0010UG5","C0010UG4","C0072","C0025","C0060UG1")
} else {
data_df <- matrix(nrow = N,ncol = length(c("C0008U1","nr","AGS","C0010U1","C0010UG5","C0010UG4","C0072","C0025","C0060UG1",
C0codes)))
data_df <- data.frame(data_df)
colnames(data_df) <- c("C0008U1","nr","AGS","C0010U1","C0010UG5","C0010UG4","C0072","C0025","C0060UG1",
C0codes)
}
## set years, farm id's, federal states (NUTS1), NUTS2, NUTS3, LAU, legal form, socio-economic type, a weighting factor (but for all farms)
set.seed(1234)
data_df$C0008U1 <- sample(years,size = N, replace = TRUE, prob = c(60,58,56,54,53,52,50)[1:length(years)])
data_df$nr <- 1:N
data_df$C0010U1 <- as.character(sample(c(1:16),size = N, replace = TRUE, prob = c(10,1,25,1,35,15,12,30,30,10,1,20,15,18,12,10)))
data_df$C0010U1 <- ifelse(nchar(data_df$C0010U1)==1,paste0("0",data_df$C0010U1),data_df$C0010U1)
data_df$C0010UG5 <- as.character(paste0(data_df$C0010U1,0))
## Adjust NUTS2 Baden-Württemberg (4), Bayern (7), Hessen (3) und Nordrhein-Westfalen (5)
data_df$C0010UG5[data_df$C0010U1==5] <- as.character(paste0(data_df$C0010U1[data_df$C0010U1==5],#
sample(c(1,3,5,7,9),
size = length(data_df$C0010U1[data_df$C0010U1==5]),
replace = TRUE,
prob = c(1,1,1,1,1))))
data_df$C0010UG5[data_df$C0010U1==3] <- as.character(paste0(data_df$C0010U1[data_df$C0010U1==3],#
sample(c(1,2,3,4),
size = length(data_df$C0010U1[data_df$C0010U1==3]),
replace = TRUE,
prob = c(1,1,1,1))))
data_df$C0010UG5[data_df$C0010U1==6] <- as.character(paste0(data_df$C0010U1[data_df$C0010U1==6],#
sample(c(1,2,3),
size = length(data_df$C0010U1[data_df$C0010U1==6]),
replace = TRUE,
prob = c(1,1,1))))
data_df$C0010UG5[data_df$C0010U1==8] <- as.character(paste0(data_df$C0010U1[data_df$C0010U1==8],#
sample(c(1,2,3,4),
size = length(data_df$C0010U1[data_df$C0010U1==8]),
replace = TRUE,
prob = c(1,1,1,1))))
data_df$C0010UG5[data_df$C0010U1==9] <- as.character(paste0(data_df$C0010U1[data_df$C0010U1==9],#
sample(c(1,2,3,4,5,6,7),
size = length(data_df$C0010U1[data_df$C0010U1==9]),
replace = TRUE,
prob = c(1,1,1,1,1,1,1))))
## Generate NUTS3 - which has not correct numbers
data_df$C0010UG4 <- paste0(data_df$C0010UG5,sample(c(51:71),size = N, replace = TRUE))
## Generate AGS (LAU) - which has not correct numbers
data_df$AGS <- paste0(data_df$C0010UG4,sample(c(101:131),size = N, replace = TRUE))
data_df$C0041 <- sample(c(1:3),size = N, replace = TRUE, prob = c(8,2,1))
data_df$C0045 <- sample(c(1,2,NA),size = N, replace = TRUE, prob = c(7,2,1))
data_df$C0072 <- sample(seq(from=1,to=6,by=0.1),size=N,replace=TRUE)
data_df$C0025 <- "S"
head(data_df)
## Randomly select farms as belonging to the population ("N") or sample ("S") - there are additional variables provided for sample farms
for(i in unique(data_df$C0008U1)){
data_df[data_df$C0008U1==i,"C0025"] <- sample(c("N","S"),size = nrow(data_df[data_df$C0008U1==i,]),prob = c(9,3.6),replace = TRUE)
}
## As in 2013 there are only sample farms, delete "N" entries from the data
data_df <- data_df[!(data_df$C0025=="N" & data_df$C0008U1==2013),]
## Randomly give type of farming to farms
for(i in unique(data_df$C0008U1)){
data_df[data_df$C0008U1==i,"C0060UG1"] <- sample(c("15","16","2","35","36_38","45","46","47","48","51","52_53","6","7","83","84"),
size = nrow(data_df[data_df$C0008U1==i,]),
replace = TRUE, prob = c(20,11,2,3,4,16,9,8,7,11,7,8,7,8,7))
}
## Aggregate type of farms to some main groups
data_df$C0060UG1 <- ifelse(data_df$C0060UG1 %in% c("15","16"),"1",data_df$C0060UG1)
data_df$C0060UG1 <- ifelse(data_df$C0060UG1 %in% c("2","35","36_38"),"2_3",data_df$C0060UG1)
data_df$C0060UG1 <- ifelse(data_df$C0060UG1 %in% c("46","47","48"),"46_48",data_df$C0060UG1)
data_df$C0060UG1 <- ifelse(data_df$C0060UG1 %in% c("51","52_53"),"5",data_df$C0060UG1)
data_df$C0060UG1 <- ifelse(data_df$C0060UG1 %in% c("6","7","83","84"),"6_7_8",data_df$C0060UG1)
## Generate some import aggregate activities
## - C0240: total utilized agricultural area
## - C0231, C0232, C0233, C0234: grass land activities
## - C0210: arable land
data_df[,"C0240"] <- rnorm(n=nrow(data_df),mean=(70+rnorm(n=1,mean=3,sd=3)),sd=40)
# data_df[,i] <- rcauchy(N, location=10.262, scale=15.102)
data_df[,"C0240"] <- ifelse(data_df[,"C0240"]<0,0,data_df[,"C0240"])
## Grünland
data_df[,"C0231"] <- data_df[,"C0240"]*0.3
data_df[,"C0232"] <- data_df[,"C0240"]*0.1
data_df[,"C0233"] <- data_df[,"C0240"]*0.05
data_df[,"C0234"] <- data_df[,"C0240"]*0.02
## arable land - residual of C0240 and C0231-C0234
data_df[,"C0210"] <- data_df[,"C0240"] - data_df[,"C0231"] - data_df[,"C0232"] - data_df[,"C0233"] - data_df[,"C0234"]
## Randomly generate hectares or herd size (does not matter for fake data) - not for total utilized agricultural area, arable land and grass land activities
for(i in C0codes[!C0codes %in% c("C0240","C0210","C0231","C0232","C0233","C0234")]){
if(is.null(i)){
cat("No additional variables generated")
next
}
data_df[,i] <- rnorm(n=nrow(data_df),mean=(10+rnorm(n=1,mean=3,sd=3)),sd=10)
## adjust negative values
data_df[,i] <- ifelse(data_df[,i]<0,0,data_df[,i])
## adjust too large values
data_df[,i] <- ifelse(data_df[,i]>3*sd(data_df[,i]),0,data_df[,i])
}
return(data_df)
}