"man/man/hello.Rd" did not exist on "e1bb225460f8a32ea7dbf12df886dab1db1d462f"
Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#' This function provides a fake sample data set which has the form of the German FSS data. It is important to verify before if the national RDC provides the data
#' in the form as desired by this function to have a proper fake data set.
#'
#' This function is written for the German Farm Structure Survey data as it was provided in the year 2021.This means, that the generated data will be in a form that fits
#' to the variables that are used from 2010 onwards - C0 codes.
#'
#' In its basic form this function generates data for the years 1999, 2003, 2007, 2010, 2013, 2016 and 2020. For 2013, it is only a sample of the population. The automatically
#' generated variables comprise 4 regional variables, 7 general variables and 7 production based variables.
#'
#' Regional variables:
#' - C0010U1: NUTS1
#' - C0010UG5: NUTS2
#' - C0010UG4: NUTS3
#' - AGS: LAU
#' The regional variables are reasonable, but far away from correct numbers.
#'
#' General variables:
#' - C0008U1: year of survey
#' - nr: farm id
#' - C0072: weighting factor - generated also for non-sample farms - only relevant for sample farms - weighted sum of a specific variable does not lead to the population sum!
#' - C0025: "N" population or "S" sample farm
#' - C0041: legal status - single farm, unincorporate farm (both as private farm) and corporate farm
#' - C0045: 1 full-time farm, 2 part-time farm, NA neither
#' - C0060UG1: farm type - here aggregated to some relevant farm types in Germany
#'
#' Production variables:
#' - C0240: total utilized agricultural area
#' - C0231, C0232, C0233, C0234: grass land activities
#' - C0210: arable land
#' These variables are coherent as grass land and arable land sum up to total land.
#'
#' Any additional variables are not coherent to these production variables.
#'
#' @description
#' This function provides a fake sample data set which has the form of the German FSS data.
#'
#'@author Sebastian Neuenfeldt
#'
#'@depends
#'
#' @param
#' name nobs
#' name years
#' name C0codes
#'
#' @return
#' Fake data set based on German FSS data.
#'
#'@import
#'
#' @export
#'
#' @examples \dontrun{
#' FSS_data_DE <- generateFakeFSSData_DE()
#' }
generateFakeFSSData_DE <- function(nobs=270000,years=c(1999,2003,2007,2010,2013,2016,2020),C0codes=NULL){
cat("\n Build fake data \n")
N <- nobs*length(years)
if(is.null(C0codes)){
data_df <- matrix(nrow = N,ncol = length(c("C0008U1","nr","AGS","C0010U1","C0010UG5","C0010UG4","C0072","C0025","C0060UG1")))
data_df <- data.frame(data_df)
colnames(data_df) <- c("C0008U1","nr","AGS","C0010U1","C0010UG5","C0010UG4","C0072","C0025","C0060UG1")
} else {
data_df <- matrix(nrow = N,ncol = length(c("C0008U1","nr","AGS","C0010U1","C0010UG5","C0010UG4","C0072","C0025","C0060UG1",
C0codes)))
data_df <- data.frame(data_df)
colnames(data_df) <- c("C0008U1","nr","AGS","C0010U1","C0010UG5","C0010UG4","C0072","C0025","C0060UG1",
C0codes)
}
## set years, farm id's, federal states (NUTS1), NUTS2, NUTS3, LAU, legal form, socio-economic type, a weighting factor (but for all farms)
set.seed(1234)
data_df$C0008U1 <- sample(years,size = N, replace = TRUE, prob = c(60,58,56,54,53,52,50)[1:length(years)])
data_df$nr <- 1:N
data_df$C0010U1 <- as.character(sample(c(1:16),size = N, replace = TRUE, prob = c(10,1,25,1,35,15,12,30,30,10,1,20,15,18,12,10)))
data_df$C0010U1 <- ifelse(nchar(data_df$C0010U1)==1,paste0("0",data_df$C0010U1),data_df$C0010U1)
data_df$C0010UG5 <- as.character(paste0(data_df$C0010U1,0))
## Adjust NUTS2 Baden-Württemberg (4), Bayern (7), Hessen (3) und Nordrhein-Westfalen (5)
data_df$C0010UG5[data_df$C0010U1==5] <- as.character(paste0(data_df$C0010U1[data_df$C0010U1==5],#
sample(c(1,3,5,7,9),
size = length(data_df$C0010U1[data_df$C0010U1==5]),
replace = TRUE,
prob = c(1,1,1,1,1))))
data_df$C0010UG5[data_df$C0010U1==3] <- as.character(paste0(data_df$C0010U1[data_df$C0010U1==3],#
sample(c(1,2,3,4),
size = length(data_df$C0010U1[data_df$C0010U1==3]),
replace = TRUE,
prob = c(1,1,1,1))))
data_df$C0010UG5[data_df$C0010U1==6] <- as.character(paste0(data_df$C0010U1[data_df$C0010U1==6],#
sample(c(1,2,3),
size = length(data_df$C0010U1[data_df$C0010U1==6]),
replace = TRUE,
prob = c(1,1,1))))
data_df$C0010UG5[data_df$C0010U1==8] <- as.character(paste0(data_df$C0010U1[data_df$C0010U1==8],#
sample(c(1,2,3,4),
size = length(data_df$C0010U1[data_df$C0010U1==8]),
replace = TRUE,
prob = c(1,1,1,1))))
data_df$C0010UG5[data_df$C0010U1==9] <- as.character(paste0(data_df$C0010U1[data_df$C0010U1==9],#
sample(c(1,2,3,4,5,6,7),
size = length(data_df$C0010U1[data_df$C0010U1==9]),
replace = TRUE,
prob = c(1,1,1,1,1,1,1))))
## Generate NUTS3 - which has not correct numbers
data_df$C0010UG4 <- paste0(data_df$C0010UG5,sample(c(51:71),size = N, replace = TRUE))
## Generate AGS (LAU) - which has not correct numbers
data_df$AGS <- paste0(data_df$C0010UG4,sample(c(101:131),size = N, replace = TRUE))
data_df$C0041 <- sample(c(1:3),size = N, replace = TRUE, prob = c(8,2,1))
data_df$C0045 <- sample(c(1,2,NA),size = N, replace = TRUE, prob = c(7,2,1))
data_df$C0072 <- sample(seq(from=1,to=6,by=0.1),size=N,replace=TRUE)
data_df$C0025 <- "S"
head(data_df)
## Randomly select farms as belonging to the population ("N") or sample ("S") - there are additional variables provided for sample farms
for(i in unique(data_df$C0008U1)){
data_df[data_df$C0008U1==i,"C0025"] <- sample(c("N","S"),size = nrow(data_df[data_df$C0008U1==i,]),prob = c(9,3.6),replace = TRUE)
}
## As in 2013 there are only sample farms, delete "N" entries from the data
data_df <- data_df[!(data_df$C0025=="N" & data_df$C0008U1==2013),]
## Randomly give type of farming to farms
for(i in unique(data_df$C0008U1)){
data_df[data_df$C0008U1==i,"C0060UG1"] <- sample(c("15","16","2","35","36_38","45","46","47","48","51","52_53","6","7","83","84"),
size = nrow(data_df[data_df$C0008U1==i,]),
replace = TRUE, prob = c(20,11,2,3,4,16,9,8,7,11,7,8,7,8,7))
}
## Aggregate type of farms to some main groups
data_df$C0060UG1 <- ifelse(data_df$C0060UG1 %in% c("15","16"),"1",data_df$C0060UG1)
data_df$C0060UG1 <- ifelse(data_df$C0060UG1 %in% c("2","35","36_38"),"2_3",data_df$C0060UG1)
data_df$C0060UG1 <- ifelse(data_df$C0060UG1 %in% c("46","47","48"),"46_48",data_df$C0060UG1)
data_df$C0060UG1 <- ifelse(data_df$C0060UG1 %in% c("51","52_53"),"5",data_df$C0060UG1)
data_df$C0060UG1 <- ifelse(data_df$C0060UG1 %in% c("6","7","83","84"),"6_7_8",data_df$C0060UG1)
## Generate some import aggregate activities
## - C0240: total utilized agricultural area
## - C0231, C0232, C0233, C0234: grass land activities
## - C0210: arable land
data_df[,"C0240"] <- rnorm(n=nrow(data_df),mean=(70+rnorm(n=1,mean=3,sd=3)),sd=40)
# data_df[,i] <- rcauchy(N, location=10.262, scale=15.102)
data_df[,"C0240"] <- ifelse(data_df[,"C0240"]<0,0,data_df[,"C0240"])
## Grünland
data_df[,"C0231"] <- data_df[,"C0240"]*0.3
data_df[,"C0232"] <- data_df[,"C0240"]*0.1
data_df[,"C0233"] <- data_df[,"C0240"]*0.05
data_df[,"C0234"] <- data_df[,"C0240"]*0.02
## arable land - residual of C0240 and C0231-C0234
data_df[,"C0210"] <- data_df[,"C0240"] - data_df[,"C0231"] - data_df[,"C0232"] - data_df[,"C0233"] - data_df[,"C0234"]
## Randomly generate hectares or herd size (does not matter for fake data) - not for total utilized agricultural area, arable land and grass land activities
for(i in C0codes[!C0codes %in% c("C0240","C0210","C0231","C0232","C0233","C0234")]){
if(is.null(i)){
cat("No additional variables generated")
next
}
data_df[,i] <- rnorm(n=nrow(data_df),mean=(10+rnorm(n=1,mean=3,sd=3)),sd=10)
## adjust negative values
data_df[,i] <- ifelse(data_df[,i]<0,0,data_df[,i])
## adjust too large values
data_df[,i] <- ifelse(data_df[,i]>3*sd(data_df[,i]),0,data_df[,i])
}
return(data_df)
}