############################################################################
### ###
### FADNUTILS: ###
### AN R PACKAGE TO EASILY LOAD AND MANIPULATE FADN DATA ###
### ###
############################################################################
## In order to use fadnUtils, we must load fadnUtils and other packages.
## R version >=3.6.1 and < 4.0.0
## Author:
############################################################################
### ###
### SETUP: ###
### DIRECTORY NAMES AND CONDITIONAL FILE ###
### ###
############################################################################
# fadnUtils always works with a user-defined data.dir
# Let's assume that the user has not created one yet.
# The following line creates a data.dir folder somewhere in our computer
# We must also have created the raw_str_map.file and pass it as an argument
# to the function. This file is copied to the data.dir folder. Thus, we can
# see the structure of the data contained in a data.dir folder by inspecting
# the raw_str_map.file residing in it.
##################################################################
## Install and load packages ##
##################################################################
# Attach every package this walkthrough relies on. A package that is not
# installed yet is fetched with install.packages() first.
# NOTE(review): install.packages() only reaches the configured repositories;
# if fadnUtils is not published there it has to be installed beforehand
# (devtools is in the list, presumably for that purpose -- confirm).
requiredPackages <- c("fadnUtils", "data.table", "devtools", "jsonlite", "ggplot2")
for (p in requiredPackages) {
  # requireNamespace() only tests availability, so the package is attached
  # exactly once by library(), which also fails loudly if the installation
  # did not succeed.
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}
#################################################################
## DIRECTORY NAMES ##
#################################################################
CurrentProjectDirectory <- "D:/public/yang/MIND_STEP/New_test_fadnUtils"
##################################################################
##                       Required files                         ##
##################################################################
# Folder holding the FADN csv files that will be loaded
fadn.data.dir <- "D:/public/data/fadn/lieferung_20210414/csv/"
# (the json file used for the extraction is passed later, in STEP 2)
# Create a data.dir inside the project directory ...
create.data.dir(folder.path = CurrentProjectDirectory)
# ... declare that we are working with it ...
set.data.dir(CurrentProjectDirectory)
# ... and query the data.dir currently in use (auto-printed at top level)
get.data.dir()
# After you create a data dir, below is a list of "real-world" example files:
# CurrentProjectDirectory/
# +-- csv
# +-- fadnUtils.metadata.json
# +-- rds
# \-- spool
# \-- readme.txt
############################################################################
### ###
### SECTION 1: ###
### IMPORT CSV FADN DATA ###
### ###
############################################################################
# .............. IMPORT DATA IN TWO STEPS ..........................................#
# However, you can import the file in two steps, one for converting
# the csv to fadn.raw.str (csv-data to raw r-data) and
# one for converting the fadn.raw.rds to fadn.str.rds (raw r-data
# to structured r-data).
#################################################################
## STEP 1: CONVERT CSV TO FADN.RAW.RDS ##
#################################################################
##-----------------------------
## load each file separately
##-----------------------------
# load for a specific country "DEU" and from a specific year "2009"
convert.to.fadn.raw.rds(
  file.path    = paste0(fadn.data.dir, "DEU2009.csv"),  # csv file to convert
  sepS         = ",",
  fadn.country = "DEU",
  fadn.year    = 2009
  # keep.csv = T # copy csv file in csv.dir
)
##-----------------------------
## load all csv files in a folder
##-----------------------------
#' Convert every FADN csv file found in a folder into raw r-data.
#'
#' File names are expected to start with a three-letter country code followed
#' by a four-digit year (e.g. "DEU2009.csv"). Both are read from the file name
#' and handed to convert.to.fadn.raw.rds() together with the file's path.
#' Files whose name does not start that way are skipped with a warning.
#'
#' @param LocationofCSVFiles Folder that contains the csv files.
#' @return NULL, invisibly; the function is called for its side effects.
allcsv2raw <- function(LocationofCSVFiles) {
  # list.files() takes a regular expression, not a glob: "\\.csv$" keeps only
  # names that really end in ".csv".
  csv_file_names <- list.files(path = LocationofCSVFiles, pattern = "\\.csv$")
  # Drop trailing separators so file.path() below does not double them.
  csv_dir <- sub("[/\\\\]+$", "", LocationofCSVFiles)
  for (file in csv_file_names) {
    if (!grepl("^[A-Za-z]{3}[0-9]{4}", file)) {
      warning("Skipping '", file,
              "': the name does not start with <country><year>.",
              call. = FALSE)
      next
    }
    # characters 1-3 hold the country, characters 4-7 the year
    country <- substr(file, 1, 3)
    # numeric, like the year passed in the single-file example
    year <- as.numeric(substr(file, 4, 7))
    convert.to.fadn.raw.rds(
      # build the path from the folder that was actually listed
      file.path = file.path(csv_dir, file),
      sepS = ",",
      fadn.country = country,
      fadn.year = year
      # keep.csv = T # copy csv file in csv.dir
    )
  }
  invisible(NULL)
}
allcsv2raw(LocationofCSVFiles = fadn.data.dir)
##-----------------------------
## load specific year and country
##-----------------------------
#' Convert the csv files of selected countries and years into raw r-data.
#'
#' For every country/year combination the file "<country><year>.csv" is looked
#' up in `csv.dir` and handed to convert.to.fadn.raw.rds(). Combinations
#' without a csv file are skipped with a warning so the remaining ones are
#' still converted.
#'
#' @param countries Character vector of country codes, e.g. c("BEL", "DEU").
#' @param years Vector of years, e.g. c(2009, 2010).
#' @param csv.dir Folder that contains the csv files. Defaults to the
#'   script-level `fadn.data.dir`.
#' @return NULL, invisibly; the function is called for its side effects.
C.Y2raw <- function(countries, years, csv.dir = fadn.data.dir) {
  # Drop trailing separators so file.path() below does not double them.
  csv.dir <- sub("[/\\\\]+$", "", csv.dir)
  for (country in countries) {
    for (year in years) {
      csv.file <- file.path(csv.dir, paste0(country, year, ".csv"))
      if (!file.exists(csv.file)) {
        warning("Skipping ", country, " ", year, ": ", csv.file,
                " does not exist.", call. = FALSE)
        next
      }
      convert.to.fadn.raw.rds(
        file.path = csv.file,
        sepS = ",",
        fadn.country = country,
        fadn.year = year
        # keep.csv = T # copy csv file in csv.dir
      )
    }
  }
  invisible(NULL)
}
# Countries (BEL, DEU and NED) and years to convert
countriesList <- c("BEL", "DEU", "NED")
yearsList <- c(2009, 2010, 2011, 2018)
C.Y2raw(countries = countriesList, years = yearsList)
show.data.dir.contents()
# If you converted the csv to raw r-data successfully, raw r-data files are saved in "rds" folder,
# the project's files and folders look like this:
# New_test_fadnUtils/
# +-- csv
# +-- fadnUtils.metadata.json
# +-- rds
# | +-- fadn.raw.2009.BEL.compressed.rds
# | +-- fadn.raw.2009.BEL.rds
# | +-- fadn.raw.2010.BEL.compressed.rds
# | +-- fadn.raw.2010.BEL.rds
# | +-- fadn.raw.2011.BEL.compressed.rds
# | +-- fadn.raw.2011.BEL.rds
# | +-- fadn.raw.2012.BEL.compressed.rds
# | \-- fadn.raw.2012.BEL.rds
# \-- spool
# \-- readme.txt
##################################################################
## STEP 2: CONVERT FADN.RAW.RDS TO FADN.STR.RDS ##
##################################################################
#######################################################################################################
# Notices:#
###########
## Before converting raw r-data into str r-data, it is recommended to use check.column() method
## so that all variables in this json file can be converted.
## The conversion of the raw r-data file to a structured r-data file is driven by a human-readable file,
## called raw_str_map.json.
## This json file is saved in extraction_dir by default.
## If you want to use the default raw_str_map.json, please put this file in extraction_dir.
## Alternatively, the user can pass the location of an external json file
## that defines how to calculate the str r-data.
#######################################################################################################
rds.dir <- paste0(get.data.dir(), "/rds/")
# Name under which the str r-data is saved inside rds.dir
new.str.name <- "test"
# The extraction_dir is the folder of that name below rds.dir. Build the path
# once and create the folder; showWarnings = FALSE keeps a re-run of the
# script quiet when the folder already exists.
new.extraction.dir <- paste0(rds.dir, new.str.name)
dir.create(new.extraction.dir, showWarnings = FALSE, recursive = TRUE)
##-----------------------------------------------------------------------------
## Step 2.0: Check the variables of loaded a raw rds data and a json file --
##-----------------------------------------------------------------------------
# Save the modifed json file
list_vars <- check.column(
  importfilepath = paste0(rds.dir, "fadn.raw.2009.BEL.rds"),        # a rds file or a csv file
  jsonfile       = "D:/public/yang/MIND_STEP/2014_after_copy.json", # a json file
  rewrite_json   = TRUE,                                            # write a new json file without unmatched variables
  extraction_dir = new.extraction.dir                               # save the new json in extraction_dir
)
# Let's see the unmatched variables in this json file
print(list_vars)
##---------------------------------------------------------------
## Step 2.1: convert convert the raw r-data into str r-data --
##---------------------------------------------------------------
# Check whether the default json file is present in extraction_dir
if (!("raw_str_map.json" %in% list.files(new.extraction.dir, pattern = "\\.json$"))) {
  warning("please put a raw_str_map.json in ", new.extraction.dir,"\n", "Or using a external json file (option 2)", "\n")
} else {
  cat(new.extraction.dir, "has a raw_str_map.json.", "\n")
}
##::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
## option 1: convert the file separately using a raw_str_map.json in extraction_dir
## making sure that a raw_str_map.json is in extraction_dir
##::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
convert.to.fadn.str.rds(
  fadn.country = "BEL",
  fadn.year    = 2009,
  str.name     = new.str.name  # extraction_dir
)
##::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
##  option 2: convert the file separately using an external json file
##::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# If force_external_raw_str_map is TRUE,
# this external json file will be copied to extraction_dir as raw_str_map.json.
convert.to.fadn.str.rds(
  fadn.country               = "BEL",
  fadn.year                  = 2009,
  raw_str_map.file           = "D:/public/yang/MIND_STEP/new_sample/test01/raw_str_map.json",  # an external json file
  str.name                   = new.str.name,  # extraction_dir
  force_external_raw_str_map = TRUE,
  DEBUG                      = FALSE
)
##::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
##  option 3: convert multiple raw r-data files in rds.dir into str r-data in extraction_dir
##::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
#' Convert every raw r-data file of the data.dir into str r-data.
#'
#' Looks at the files "fadn.raw.<year>.<country>.rds" in the "rds" folder of
#' the current data.dir and calls convert.to.fadn.str.rds() for each of them.
#' Compressed copies are reported and left alone; other .rds files are skipped.
#' A warning raised during a conversion is reported and the conversion goes
#' on; an error is reported and the loop continues with the next file.
#'
#' @param Current_raw_str_map.file Passed on as `raw_str_map.file`; NULL by
#'   default.
#' @param overwrite_external_json Passed on as `force_external_raw_str_map`;
#'   FALSE by default.
#' @param str.name Passed on as `str.name`. Defaults to the script-level
#'   `new.str.name`.
#' @return NULL, invisibly; the function is called for its side effects.
raw2str <- function(Current_raw_str_map.file = NULL,
                    overwrite_external_json = FALSE,
                    str.name = new.str.name) {
  rds.dir <- paste0(get.data.dir(), "/rds")
  raw_file_names <- dir(rds.dir, pattern = "\\.rds$")
  # Year and country are taken from the name as a whole instead of from fixed
  # character positions, so unrelated .rds files cannot yield garbage values.
  raw_name_pattern <- "^fadn\\.raw\\.([0-9]{4})\\.([A-Za-z]{3})\\.rds$"
  for (file in raw_file_names) {
    if (grepl("compressed", file)) {
      cat("It's compressed data!", sep = "\n")
      next
    }
    if (!grepl(raw_name_pattern, file)) {
      message("Skipping '", file, "': not a fadn.raw.<year>.<country>.rds file.")
      next
    }
    year <- as.numeric(sub(raw_name_pattern, "\\1", file))
    country <- sub(raw_name_pattern, "\\2", file)
    cat("converting the str data for country: ", country, " and year: ", year, "\n")
    tryCatch(
      # A calling handler reports the warning and resumes the conversion.
      # Handling warnings in tryCatch() itself would unwind the call and
      # leave the file unconverted.
      withCallingHandlers(
        convert.to.fadn.str.rds(
          fadn.country = country,
          fadn.year = year,
          raw_str_map.file = Current_raw_str_map.file,
          force_external_raw_str_map = overwrite_external_json,
          str.name = str.name
        ),
        warning = function(w) {
          message('Caught an warning!')
          print(w)
          invokeRestart("muffleWarning")
        }
      ),
      error = function(e) {
        message("Caught an error! Please check the objects in json file using check.column() (see more in USE_CASE_4.R).")
        print(e)
      }
    )
  }
  invisible(NULL)
}
#---------------------------------------------------------------
# option 3.1: using a raw_str_map.json by defaut
#---------------------------------------------------------------
# convert str data with a default json file, make sure that a raw_str_map.json in extraction_dir
raw2str()
# or
raw2str(
  Current_raw_str_map.file = "D:/public/yang/MIND_STEP/new_sample/test01/raw_str_map.json",
  overwrite_external_json  = FALSE
)
#---------------------------------------------------------------
#  option 3.2: using an external json file
#---------------------------------------------------------------
# convert str data using an external json file
raw2str(
  Current_raw_str_map.file = "D:/public/yang/MIND_STEP/new_sample/test01/raw_str_map.json",
  overwrite_external_json  = TRUE
)
show.data.dir.contents()
# If str r-data was converted, the str r-data is saved in "test"(new.str.name) folder as below.
# New_test_fadnUtils/
# +-- csv
# +-- fadnUtils.metadata.json
# +-- rds
# | +-- fadn.raw.2009.BEL.compressed.rds
# | +-- fadn.raw.2009.BEL.rds
# | +-- fadn.raw.2010.BEL.compressed.rds
# | +-- fadn.raw.2010.BEL.rds
# | +-- fadn.raw.2011.BEL.compressed.rds
# | +-- fadn.raw.2011.BEL.rds
# | +-- fadn.raw.2012.BEL.compressed.rds
# | +-- fadn.raw.2012.BEL.rds
# | \-- test
# | +-- fadn.str.2009.BEL.rds
# | +-- raw_str_map.json
# | \-- rewrite_2014_after_copy.json
# \-- spool
# +-- my_logfile.txt
# \-- readme.txt
###########################################################################
### ###
### SECTION 2: ###
### LOAD R-DATA FROM A DATA.DIR ###
### ###
###########################################################################
#################################################################
## LOAD RAW R-DATA ##
#################################################################
### We can either load raw r-data files (the original FADN csv in r-friendly format),
### or structured r-data files (the original data transformed into meaningful
### information)
# To load raw r-data, only for BEL and 2009
my.data <- load.fadn.raw.rds(countries = "BEL", years = 2009)
# my.data is a single large data.table, with the original csv columns and rows
nrow(my.data)           # Number of rows
names(my.data)          # Column names
length(names(my.data))  # Number of columns
str(my.data)            # Overall structure
##################################################################
## LOAD STRUCTURED R-DATA ##
##################################################################
#To load structured data, for BEL and 2009
my.data.2009 <- load.fadn.str.rds(
  countries      = "BEL",
  years          = 2009,
  extraction_dir = new.str.name  # Location of the str r-data
)
# my.data.2009 is a list with three elements: info, costs, crops
str(my.data.2009)
# Each element can be accessed individually
str(my.data.2009$info)
str(my.data.2009$costs)
str(my.data.2009$crops)
# The first columns of each of the above elements (info, costs, crops)
# are created according to the ID section of the raw_str_map
names(my.data.2009$info)
names(my.data.2009$costs)
names(my.data.2009$crops)
# info and costs data.tables are in wide format (each observation in a single
# row, all attributes of a single observation in different columns).
# The crops element is in long format (one observation is spread over many rows).
# See https://seananderson.ca/2013/10/19/reshape/ for
# a discussion of the two types of data formats
head(my.data.2009$info)
head(my.data.2009$costs)
head(my.data.2009$crops)
# The attributes of each of the above elements also give access to the
# column formulas and descriptions, as defined in the raw_str_map file.
View(attr(my.data.2009$info, "column.descriptions"))
View(attr(my.data.2009$costs, "column.descriptions"))
View(attr(my.data.2009$crops, "column.descriptions"))
# Especially for the crops element, we can also see the description of the
# CROP column
View(attr(my.data.2009$crops, "crops.descriptions"))
#################################################################
## LOAD COUNTRIES-YEARS COMBINATIONS ##
#################################################################
### In the previous examples, we showed how to load data for one country and
### one year In the following examples we show more combinations.
# Load DEU and NED for the years 2009, 2010 and 2011
my.data <- load.fadn.str.rds(
  countries      = c("DEU", "NED"),
  years          = c(2009, 2010, 2011),
  extraction_dir = new.str.name
)
# Load DEU and NED for all years
my.data <- load.fadn.str.rds(
  countries      = c("DEU", "NED"),
  extraction_dir = new.str.name
)
# Load all available countries for year 2015
my.data <- load.fadn.str.rds(years = 2015, extraction_dir = new.str.name)
# Load all available data
my.data <- load.fadn.str.rds(extraction_dir = new.str.name)
#################################################################
## HOW TO STORE THE LOAD ##
#################################################################
#TODOS
# Since loading data sometimes takes time and creates big datasets,
# fadnUtils offers a way to save the dataset created from the load call
# The first step is to store the loaded data
# Provide the object to save, a name and a description
# store.rds.data(my.data,"everything","all countries and years are here")
############################################################################
############################################################################
### ###
### SECTION3: ###
### PERFORM ANALYSIS ###
### ###
############################################################################
############################################################################
#We load structured data for all available countries and years
my.str.data <- load.fadn.str.rds(extraction_dir = new.str.name)
##----------------------------------------------------------------
##  HOW MANY FARMS FOR EACH COUNTRY AND EACH YEAR                -
##----------------------------------------------------------------
# Use the info DT and group by YEAR-COUNTRY
my.str.data$info[, .N, by = list(YEAR, COUNTRY)]
# dcast gives the same counts in a more tabular layout.
# value.var is left at its default: the aggregation function is length, so
# the counts do not depend on which column supplies the values.
dcast(my.str.data$info, YEAR ~ COUNTRY, fun.aggregate = length)
# The table can also be exported to the clipboard with the write.excel
# utility function: run the command below, open excel and paste.
write.excel(
  dcast(my.str.data$info, YEAR ~ COUNTRY, fun.aggregate = length)
)
##---------------------------------------------------------------
## ALL CROP AREAS PER COUNTRY-YEAR --
##---------------------------------------------------------------
# First, calculate the weighted area
# The weighted value is needed for LEVL (areas) and GROF (production).
# It is added by reference, in a statement of its own: DT[i, col := ...]
# returns the WHOLE table, not the rows selected by i, so it must not be used
# as the (filtered) input of dcast.
my.str.data$crops[
  VARIABLE %in% c("LEVL", "GROF"),
  VALUE.w := WEIGHT * VALUE / 1000
]
# Then dcast the LEVL rows only
dcast(
  my.str.data$crops[VARIABLE == "LEVL"],
  COUNTRY + CROP ~ YEAR,
  value.var = "VALUE.w",
  fun.aggregate = sum,
  na.rm = TRUE
)
##---------------------------------------------------------------
##  ALL CROP PRODUCTION PER COUNTRY-YEAR                        -
##---------------------------------------------------------------
# Only the GROF rows may enter the sum; otherwise the LEVL values computed
# above would be added to the production figures.
dcast(
  my.str.data$crops[VARIABLE == "GROF"],
  COUNTRY + CROP ~ YEAR,
  value.var = "VALUE.w",
  fun.aggregate = sum,
  na.rm = TRUE
)
##---------------------------------------------------------------
##  BARLEY PRODUCTION PER COUNTRY-YEAR                          -
##---------------------------------------------------------------
# Restrict to the GROF rows of barley before aggregating
dcast(
  my.str.data$crops[VARIABLE == "GROF" & CROP == "BARL"],
  COUNTRY ~ YEAR,
  value.var = "VALUE.w",
  fun.aggregate = sum,
  na.rm = TRUE
)
##----------------------------------------------------------------
## DISTRIBUTION OF NUMBER OF CROPS PER COUNTRY-YEAR --
##----------------------------------------------------------------
crops.data <- my.str.data$crops  # shorter handle for the next steps
# Number of crops for each farm-country-year.
# Be careful: only the LEVL variable must be counted.
crops.data.Ncrops <- crops.data[
  VARIABLE == "LEVL",
  .N,
  by = list(COUNTRY, YEAR, ID)
]
# Quantiles of the number of crops per year and country, ordered by country
crops.data.Ncrops[
  , as.list(quantile(N)), by = list(YEAR, COUNTRY)
][order(COUNTRY)]
# R excels at the graphic representation of results
library(ggplot2)
ggplot(crops.data.Ncrops, aes(y = N, x = 1)) +
  geom_boxplot() +
  facet_grid(YEAR ~ COUNTRY) +
  ylab("Number of Crops") +
  # the x axis carries no information here, so hide it completely
  theme(
    axis.title.x = element_blank(),
    axis.text.x  = element_blank(),
    axis.ticks.x = element_blank()
  )
##---------------------------------------------------------------------------
## COLLECT COMMON ID FROM LOADED STRUCTURED R-DATA AND LOADED RAW R_DATA --
##---------------------------------------------------------------------------
# Collection the common id from loaded str r-data
collected.common.id_str <- collect.common.id(my.str.data)
# Collect the common id from loaded raw r-data.
# my.data cannot be used here: its last assignment (in the section on
# countries-years combinations) stored structured data in it, so raw data is
# loaded explicitly.
# NOTE(review): loads the countries/years converted earlier in this script;
# adjust the selection to the raw files that exist in your data.dir.
my.raw.data <- load.fadn.raw.rds(countries = countriesList, years = yearsList)
collected.common.id_raw <- collect.common.id(my.raw.data)
##---------------------------------------------------------------
## CALCULATION BASED ON COLLECETED COMMON ID --
##---------------------------------------------------------------
# sommaries for infos
# sample and representend number of farms
my.str.data$info[
  , list(Nobs_sample = .N, Nobs_represented = sum(WEIGHT)),
  by = .(COUNTRY, YEAR)
]
# only for the full sample (common id over years in selected data)
# The farm identifier column is spelled ID, as in the crops filters below;
# the lower-case `id` used before does not match that column name.
my.str.data$info[
  ID %in% collected.common.id_str[[1]],
  list(Nobs_sample = .N, Nobs_represented = sum(WEIGHT)),
  by = .(COUNTRY, YEAR)
]
# some summaries for crops
# unweighted and weighted sum over countries, years, crops and variables (EAAP GROF INTF LEVL SHARE UVAG UVSA)
# EAA: Economic Accounts of Agriculture
# EAAP: EAA value at producer prices
# GROF
my.str.data$crops[
  , list(VALUE = sum(VALUE), VALUE_weighted = sum(VALUE * WEIGHT)),
  by = .(COUNTRY, YEAR, CROP, VARIABLE)
]
# only for the full sample (common id over years in selected data)
my.str.data$crops[
  ID %in% collected.common.id_str[[1]],
  list(VALUE = sum(VALUE), VALUE_weighted = sum(VALUE * WEIGHT)),
  by = .(COUNTRY, YEAR, CROP, VARIABLE)
]
# the same per farm (ID added to the grouping); the ids are taken from the
# common_id column here
# NOTE(review): presumably common_id is the first column, i.e. the same ids
# as [[1]] above -- confirm.
my.str.data$crops[
  ID %in% collected.common.id_str[, common_id],
  list(VALUE = sum(VALUE), VALUE_weighted = sum(VALUE * WEIGHT)),
  by = .(COUNTRY, YEAR, CROP, VARIABLE, ID)
]
##---------------------------------------------------------------
## Load fadn raw data and search the the number of common id for adjacent combination years
##---------------------------------------------------------------
# find all adjacent combinations in a list
#' List every run of adjacent elements of a vector.
#'
#' Returns all contiguous windows of `Data`: first every window of length 1
#' (in order of their start position), then every window of length 2, and so
#' on up to the single window that covers the whole vector.
#' Adjacency refers to positions in `Data`, not to the values themselves.
#'
#' @param Data A vector (e.g. of years).
#' @return A list of sub-vectors of `Data`; an empty list for empty input.
myFun <- function(Data) {
  n <- length(Data)
  # 1:length(Data) would count down (1, 0) for empty input and fail
  if (n == 0L) {
    return(list())
  }
  windows_by_width <- lapply(seq_len(n), function(width) {
    # a window of this width can start at positions 1 .. n - width + 1
    lapply(seq_len(n - width + 1L), function(start) {
      Data[start:(start + width - 1L)]
    })
  })
  # flatten one level: a list of windows instead of a list of lists
  unlist(windows_by_width, recursive = FALSE, use.names = FALSE)
}
# add a string to the facet label text and split it in two lines
#' Build facet labels that carry an extra text on a second line.
#'
#' @param original_var Variable whose levels (after conversion to a factor)
#'   are the facet values.
#' @param custom_name Text appended to every level after a line break.
#' @return Character vector of labels, named by the levels.
label_facet <- function(original_var, custom_name) {
  facet_levels <- levels(as.factor(original_var))
  setNames(paste0(facet_levels, " \n ", custom_name), facet_levels)
}
# multi-panel plots using facet_wrap() for dynamic choice
#' Multi-panel bar chart of the number of common IDs of one country.
#'
#' Draws one horizontal bar per entry of `df$Years`, labelled with `Num_id`,
#' in one facet per value of `df$group` (built with facet_wrap()).
#'
#' @param country Country code; only used in the plot title.
#' @param df data.frame with the columns Years, Num_id and group.
#' @param n Width (in characters) at which the Years labels are wrapped.
#' @return The ggplot object.
figure <- function(country, df, n) {
  ggplot(df, aes(x = Years, y = Num_id)) +
    geom_bar(
      stat = "identity",
      position = position_dodge(width = 0.8),
      width = 0.5,
      fill = "#00abff"
    ) +
    coord_flip() +
    xlab("Years") +
    ylab("Number of common ID") +
    geom_text(aes(label = Num_id), vjust = 0.5, colour = "black", size = 3.5) +
    # stringr:: makes the function independent of what the caller attached
    scale_x_discrete(labels = function(x) stringr::str_wrap(x, width = n)) +
    facet_wrap(
      ~ group,
      scales = "free",
      labeller = labeller(group = label_facet(df$group, "adjacent combinations"))
    ) +
    ggtitle(paste0("Number of common ID for country: ", country)) +
    # theme_bw() is a complete theme and replaces every theme setting made
    # before it, so it has to come first; the tweaks below then take effect.
    theme_bw() +
    theme(
      axis.text.x = element_text(color = "black", size = 6, angle = 0,
                                 vjust = .8, hjust = 0.8),
      plot.title = element_text(hjust = 0.5)
    )
}
# raw data
#' Count the common IDs of all adjacent year combinations, per country.
#'
#' For every country the raw r-data files "...<country>.rds" in the "rds"
#' folder of the current data.dir are listed, every combination of adjacent
#' years is loaded with load.fadn.raw.rds() and the number of rows returned by
#' collect.common.id() is recorded. Countries without raw files, or without
#' any loadable data, are skipped with a warning.
#'
#' @param countires_list Character vector of country codes.
#' @param saveExcel If TRUE, write one sheet per country to
#'   "<data.dir>/spool/fadn_common_id.xlsx".
#' @param savePlots If TRUE, save one plot per country as
#'   "<data.dir>/plots/<country>_plot.png".
#' @return Named list (by country) of data.frames with the columns Num_id and
#'   Years.
output_common_id <- function(countires_list, saveExcel = TRUE, savePlots = TRUE) {
  rds.dir <- paste0(get.data.dir(), "/rds/")
  outlist <- list()

  if (saveExcel) {
    # Only openxlsx is used for the workbook. The xlsx package is no longer
    # attached: it defines functions of the same names and was never used.
    library(openxlsx)
    xlsx_file <- paste0(get.data.dir(), "/spool/", "fadn_common_id.xlsx")
    # createWorkbook()'s first argument is the creator, not a file name
    wb <- createWorkbook()
  }
  if (savePlots) {
    library(ggplot2)
    library(stringr)
    # make sure the target folder exists before the plots are saved
    plots.dir <- paste0(get.data.dir(), "/plots/")
    dir.create(plots.dir, showWarnings = FALSE, recursive = TRUE)
  }

  for (country in countires_list) {
    cat("Country:", country, '\n')
    # "<country>.rds" at the end of the name excludes the compressed copies
    raw_file_names <- dir(rds.dir, pattern = paste0(country, "\\.rds$"))
    if (length(raw_file_names) == 0L) {
      warning("No raw r-data found for country ", country, "; skipped.",
              call. = FALSE)
      next
    }
    # the only digits in the file name are those of the year
    years_list <- as.numeric(gsub("\\D+", "", raw_file_names))
    adjacent_list <- myFun(years_list)

    # raw data of every adjacent combination, named "2009", "2009, 2010", ...
    my.data <- list()
    for (year_items in adjacent_list) {
      name <- toString(year_items)
      data <- load.fadn.raw.rds(countries = country, years = year_items)
      if (length(data) == 0) {
        cat(year_items, " not exist")
      } else {
        my.data[[name]] <- data
      }
    }
    if (length(my.data) == 0L) {
      warning("No raw r-data could be loaded for country ", country,
              "; skipped.", call. = FALSE)
      next
    }

    # number of common ids per combination
    Big.Num.Common.id <- vapply(
      my.data,
      function(x) nrow(collect.common.id(x)),
      numeric(1)
    )
    DF <- data.frame(
      Num_id = unname(Big.Num.Common.id),
      Years = names(my.data),
      row.names = names(my.data),
      stringsAsFactors = FALSE
    )
    outlist[[country]] <- DF

    if (saveExcel) {
      if (!(country %in% names(wb))) addWorksheet(wb, country)
      print("write")
      writeData(wb, country, DF)
    }

    if (savePlots) {
      # A label holds 4 characters for the first year and 6 more (", " plus
      # the year) for each further one, so its length identifies the number
      # of years in the combination.
      DF$Length <- str_count(DF$Years)
      DF$group <- cut(DF$Length, breaks = c(1, 5, 14, 18, 25, 30, 35, 40, 48, 55, 58, 70, Inf))
      levels(DF$group) <- c("1 year", "2 years", "3 years", "4 years", "5 years", "6 years", "7 years",
                            "8 years", "9 years", "10 years", "11, 12 years", ">12 years")
      # NED gets a wider wrap width for its year labels
      wrap.width <- if (country == "NED") 35 else 20
      p <- figure(country, DF, wrap.width)
      ggsave(plot = p,
             filename = paste0(plots.dir, country, "_plot.png"),
             width = 18, height = 8)
    }
  }

  if (saveExcel) {
    saveWorkbook(wb, xlsx_file, overwrite = TRUE)
  }
  return(outlist)
}
DF <- output_common_id(countires_list = c("DEU", "NED", "BEL"))