Newer
Older
#' This function provides a fake sample data set which has the form of the German FSS data. It is important to verify before if the national RDC provides the data
#' in the form as desired by this function to have a proper fake data set.
#'
#' This function is written for the German Farm Structure Survey data as it was provided in the year 2021.This means, that the generated data will be in a form that fits
#' to the variables that are used from 2010 onwards - C0 codes.
#'
#' In its basic form this function generates data for the years 1999, 2003, 2007, 2010, 2013, 2016 and 2020. For 2013, it is only a sample of the population. The automatically
#' generated variables comprise 4 regional variables, 7 general variables and 7 production based variables.
#'
Sebastian Neuenfeldt
committed
#' \emph{Regional variables}:
#' \itemize{
#' \item C0010U1: NUTS1
#' \item C0010UG5: NUTS2
#' \item C0010UG4: NUTS3
#' \item AGS: LAU
#' }
#' The regional variables are reasonable, but far away from correct numbers.
#'
Sebastian Neuenfeldt
committed
#' \emph{General variables}:
#' \itemize{
#' \item C0008U1: year of survey
#' \item nr: farm id
#' \item C0072: weighting factor - generated also for non-sample farms - only relevant for sample farms - weighted sum of a specific variable does not lead to the population sum!
#' \item C0025: "N" population or "S" sample farm
#' \item C0041: legal status - single farm, unincorporate farm (both as private farm) and corporate farm
#' \item C0045: 1 full-time farm, 2 part-time farm, NA neither
#' \item C0060UG1: farm type - aggregated to some relevant farm types in Germany
#' }
#' \emph{Production variables}:
#' \itemize{
#' \item C0240: total utilized agricultural area
#' \item C0231, C0232, C0233, C0234: grass land activities
#' \item C0210: arable land
#' }
#' These variables are coherent as grass land and arable land sum up to total land.
#'
Sebastian Neuenfeldt
committed
#' Any additional variables provided via C0codes argument are not coherent to these production variables.
#'
#' @description
#' This function provides a fake sample data set which has the form of the German FSS data.
#'
#'@author Sebastian Neuenfeldt
#'
#'
Sebastian Neuenfeldt
committed
#' @param nobs Number of observations approximately to be generated
#'
#' @param years Years of survey
#'
#' @param C0codes Optional variables to be generated, only meaningful for continues variables.
#'
#' @return
Sebastian Neuenfeldt
committed
#' Retruns a fake data set based on German FSS data.
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#'
#'@import
#'
#' @export
#'
#' @examples \dontrun{
#' FSS_data_DE <- generateFakeFSSData_DE()
#' }
generateFakeFSSData_DE <- function(nobs=270000,years=c(1999,2003,2007,2010,2013,2016,2020),C0codes=NULL){
cat("\n Build fake data \n")
N <- nobs*length(years)
if(is.null(C0codes)){
data_df <- matrix(nrow = N,ncol = length(c("C0008U1","nr","AGS","C0010U1","C0010UG5","C0010UG4","C0072","C0025","C0060UG1")))
data_df <- data.frame(data_df)
colnames(data_df) <- c("C0008U1","nr","AGS","C0010U1","C0010UG5","C0010UG4","C0072","C0025","C0060UG1")
} else {
data_df <- matrix(nrow = N,ncol = length(c("C0008U1","nr","AGS","C0010U1","C0010UG5","C0010UG4","C0072","C0025","C0060UG1",
C0codes)))
data_df <- data.frame(data_df)
colnames(data_df) <- c("C0008U1","nr","AGS","C0010U1","C0010UG5","C0010UG4","C0072","C0025","C0060UG1",
C0codes)
}
## set years, farm id's, federal states (NUTS1), NUTS2, NUTS3, LAU, legal form, socio-economic type, a weighting factor (but for all farms)
set.seed(1234)
data_df$C0008U1 <- sample(years,size = N, replace = TRUE, prob = c(60,58,56,54,53,52,50)[1:length(years)])
data_df$nr <- 1:N
data_df$C0010U1 <- as.character(sample(c(1:16),size = N, replace = TRUE, prob = c(10,1,25,1,35,15,12,30,30,10,1,20,15,18,12,10)))
data_df$C0010U1 <- ifelse(nchar(data_df$C0010U1)==1,paste0("0",data_df$C0010U1),data_df$C0010U1)
data_df$C0010UG5 <- as.character(paste0(data_df$C0010U1,0))
## Adjust NUTS2 Baden-Württemberg (4), Bayern (7), Hessen (3) und Nordrhein-Westfalen (5)
data_df$C0010UG5[data_df$C0010U1==5] <- as.character(paste0(data_df$C0010U1[data_df$C0010U1==5],#
sample(c(1,3,5,7,9),
size = length(data_df$C0010U1[data_df$C0010U1==5]),
replace = TRUE,
prob = c(1,1,1,1,1))))
data_df$C0010UG5[data_df$C0010U1==3] <- as.character(paste0(data_df$C0010U1[data_df$C0010U1==3],#
sample(c(1,2,3,4),
size = length(data_df$C0010U1[data_df$C0010U1==3]),
replace = TRUE,
prob = c(1,1,1,1))))
data_df$C0010UG5[data_df$C0010U1==6] <- as.character(paste0(data_df$C0010U1[data_df$C0010U1==6],#
sample(c(1,2,3),
size = length(data_df$C0010U1[data_df$C0010U1==6]),
replace = TRUE,
prob = c(1,1,1))))
data_df$C0010UG5[data_df$C0010U1==8] <- as.character(paste0(data_df$C0010U1[data_df$C0010U1==8],#
sample(c(1,2,3,4),
size = length(data_df$C0010U1[data_df$C0010U1==8]),
replace = TRUE,
prob = c(1,1,1,1))))
data_df$C0010UG5[data_df$C0010U1==9] <- as.character(paste0(data_df$C0010U1[data_df$C0010U1==9],#
sample(c(1,2,3,4,5,6,7),
size = length(data_df$C0010U1[data_df$C0010U1==9]),
replace = TRUE,
prob = c(1,1,1,1,1,1,1))))
## Generate NUTS3 - which has not correct numbers
data_df$C0010UG4 <- paste0(data_df$C0010UG5,sample(c(51:71),size = N, replace = TRUE))
## Generate AGS (LAU) - which has not correct numbers
data_df$AGS <- paste0(data_df$C0010UG4,sample(c(101:131),size = N, replace = TRUE))
data_df$C0041 <- sample(c(1:3),size = N, replace = TRUE, prob = c(8,2,1))
data_df$C0045 <- sample(c(1,2,NA),size = N, replace = TRUE, prob = c(7,2,1))
data_df$C0072 <- sample(seq(from=1,to=6,by=0.1),size=N,replace=TRUE)
data_df$C0025 <- "S"
head(data_df)
## Randomly select farms as belonging to the population ("N") or sample ("S") - there are additional variables provided for sample farms
for(i in unique(data_df$C0008U1)){
data_df[data_df$C0008U1==i,"C0025"] <- sample(c("N","S"),size = nrow(data_df[data_df$C0008U1==i,]),prob = c(9,3.6),replace = TRUE)
}
## As in 2013 there are only sample farms, delete "N" entries from the data
data_df <- data_df[!(data_df$C0025=="N" & data_df$C0008U1==2013),]
## Randomly give type of farming to farms
for(i in unique(data_df$C0008U1)){
data_df[data_df$C0008U1==i,"C0060UG1"] <- sample(c("15","16","2","35","36_38","45","46","47","48","51","52_53","6","7","83","84"),
size = nrow(data_df[data_df$C0008U1==i,]),
replace = TRUE, prob = c(20,11,2,3,4,16,9,8,7,11,7,8,7,8,7))
}
## Aggregate type of farms to some main groups
data_df$C0060UG1 <- ifelse(data_df$C0060UG1 %in% c("15","16"),"1",data_df$C0060UG1)
data_df$C0060UG1 <- ifelse(data_df$C0060UG1 %in% c("2","35","36_38"),"2_3",data_df$C0060UG1)
data_df$C0060UG1 <- ifelse(data_df$C0060UG1 %in% c("46","47","48"),"46_48",data_df$C0060UG1)
data_df$C0060UG1 <- ifelse(data_df$C0060UG1 %in% c("51","52_53"),"5",data_df$C0060UG1)
data_df$C0060UG1 <- ifelse(data_df$C0060UG1 %in% c("6","7","83","84"),"6_7_8",data_df$C0060UG1)
## Generate some import aggregate activities
## - C0240: total utilized agricultural area
## - C0231, C0232, C0233, C0234: grass land activities
## - C0210: arable land
data_df[,"C0240"] <- rnorm(n=nrow(data_df),mean=(70+rnorm(n=1,mean=3,sd=3)),sd=40)
# data_df[,i] <- rcauchy(N, location=10.262, scale=15.102)
data_df[,"C0240"] <- ifelse(data_df[,"C0240"]<0,0,data_df[,"C0240"])
## Grünland
data_df[,"C0231"] <- data_df[,"C0240"]*0.3
data_df[,"C0232"] <- data_df[,"C0240"]*0.1
data_df[,"C0233"] <- data_df[,"C0240"]*0.05
data_df[,"C0234"] <- data_df[,"C0240"]*0.02
## arable land - residual of C0240 and C0231-C0234
data_df[,"C0210"] <- data_df[,"C0240"] - data_df[,"C0231"] - data_df[,"C0232"] - data_df[,"C0233"] - data_df[,"C0234"]
## Randomly generate hectares or herd size (does not matter for fake data) - not for total utilized agricultural area, arable land and grass land activities
for(i in C0codes[!C0codes %in% c("C0240","C0210","C0231","C0232","C0233","C0234")]){
if(is.null(i)){
cat("No additional variables generated")
next
}
data_df[,i] <- rnorm(n=nrow(data_df),mean=(10+rnorm(n=1,mean=3,sd=3)),sd=10)
## adjust negative values
data_df[,i] <- ifelse(data_df[,i]<0,0,data_df[,i])
## adjust too large values
data_df[,i] <- ifelse(data_df[,i]>3*sd(data_df[,i]),0,data_df[,i])
}
return(data_df)
}