#' Imports a DG-AGRI csv into fadnUtils #' #' It first call the convert.to.fadn.raw.rds and then convert.to.fadn.str.rds #' #' @param file.path the full path of the file (the filename must be included) #' @param raw.f the raw_str_map file to use. it must reside inside 'raw_str_maps; folder of the data.dir #' @param sepS the separator of the csv files (by default ",") #' @param fadn.year the year the csv files refers to (e.g. 2001) #' @param fadn.country the three letter country code the csv files refers to (e.g. "ELL") #' @param keep.csv if TRUE, copy the csv files; else do not copy #' #' @return NULL #' @export #' #' @examples import.fadn.csv <- function (file.path, raw.f=NULL, sepS=",", fadn.year= NA, fadn.country = NA, keep.csv=F) { #if file exist if(!file.exists(file.path)) { cat(paste0("File ",file.path," does not exist. Exiting ...\n")) return(invisible(FALSE)) } # check for fadnUtils.data.dir if(is.null(get.data.dir())) { cat("You have first to set the fadnUtils.data.dir using set.data.dir function. Exiting ....\n") return(FALSE) } else { data.dir = get.data.dir(); csv.file = basename(file.path) } if(is.null(raw.f)) { cat("You have to give a raw_str_map. Exiting ....\n") return(FALSE) } if(convert.to.fadn.raw.rds(file.path,sepS,fadn.year,fadn.country,keep.csv)) { convert.to.fadn.str.rds(fadn.country,fadn.year,raw.f) } else { cat("Failed to import. Exiting ...\n") return(invisible(NULL)) } } #' Gets a fadn.raw.csv (csv file from DG-AGRI) and transforms it accordingly to fadn.raw.rds #' #' It saves two files: #' - One that contain a wide format of the data, i.e. in tabular format that is identical to the csv data. This is uncompressed data. #' - One that holds the same information in compressed data. It is a list that contains $data.char and $data.num data.tables in long format. 0 values are removed and only the col.id is the index on both data.tables #' #' @param file.path the full path of the csv file (the filename must be included) #' @param sepS the separator of the csv files (by default ",") #' @param fadn.year the year the csv files refers to (e.g. 2001) #' @param fadn.country the three letter country code the csv files refers to (e.g. "ELL") #' @param keep.csv if TRUE, copy the csv files to the CSV directory; else do not copy #' #' @return Saves the fadn.raw.rds file and returns TRUE if everything goes well #' @import data.table #' #' @export #' @examples convert.to.fadn.raw.rds <- function(file.path="", sepS=",", fadn.year= NA, fadn.country = NA, keep.csv = F, col.id = "ID") { library(data.table) #if file exist if(!file.exists(file.path)) { cat(paste0("File ",file.path," does not exist. Exiting ...\n")) return(FALSE) } # check for fadnUtils.data.dir if(is.null(get.data.dir())) { cat("You have first to set the fadnUtils.data.dir using set.data.dir function. Exiting ....\n") return(FALSE) } else { data.dir = get.data.dir(); csv.file = basename(file.path) } if(file.exists(paste0(data.dir,"/csv/",csv.file))) {cat("File exists. Overwriting ...\n")} #copy csv to data.dir/csv if(keep.csv) { print(" copying file") file.copy(file.path,paste0(data.dir,"/csv/",csv.file)) } #convert to uncompressed rds and save print(" creating fadn.raw.rds") data.raw = data.table(read.csv(file.path,header = T, as.is = T)) attr(data.raw,"original.file.path") <-file.path attr(data.raw,"fadn.year")<-fadn.year attr(data.raw,"fadn.country")<-fadn.country data.name = paste0("fadn.raw.",fadn.year,".",fadn.country,".rds") saveRDS(data.raw,paste0(data.dir,"/rds/",data.name)) #convert to compressed rds and save data.raw.classes = data.table(col.name=names(data.raw),col.class=sapply(data.raw,class)) data.raw.compr = list() char.cols = c(col.id, data.raw.classes[col.class=="character",col.name]) data.raw.compr$data.char = data.raw[,..char.cols] num.cols = c(col.id, data.raw.classes[!col.class=="character",col.name]) data.raw.compr$data.num = melt(data.raw[,..num.cols],id.vars = col.id)[!value==0] attr(data.raw.compr,"original.file.path") <-file.path attr(data.raw.compr,"fadn.year")<-fadn.year attr(data.raw.compr,"fadn.country")<-fadn.country attr(data.raw.compr,"col.names")<-names(data.raw) attr(data.raw.compr,"col.id")<-col.id data.name = paste0("fadn.raw.",fadn.year,".",fadn.country,".compressed.rds") saveRDS(data.raw.compr,paste0(data.dir,"/rds/",data.name)) return(invisible(TRUE)) } #' Converts an fadn.raw.rds file to fadn.str.rds file using a raw_str_map.json file #' #' The raw_str_map.json specification is as follows: #' #' { #' "id": { "COLUMN in every list member in RDS": "COLUMN IN CSV", ....}, #' "info": { "COLUMN in info RDS": "COLUMN IN CSV", ....}, #' "livestock": {} #' "crops": { #' "CROP NAME 1": {"description": "description of crop name", "columns": {"VARIABLE NAME": COLUMN IN CSV", ....} }, #' "CROP NAME 2": {"description": "description of crop name", "columns": {"VARIABLE NAME": COLUMN IN CSV", ....} }, #' .... #' } #' } #' #' #' The structure of the str.dir: #' - A data.dir can hold more than one extractions. #' - Each extraction has a short name (20 or less characters, whitespace is not allowed) #' - Each extraction is stored in the data.dir/rds/ #' - That folder contains the following files: #' + raw_str_map.json: the raw_str_map #' + fadn.str.<4-digit YEAR>.<3-letter COUNTRY>.rds: the extracted data #' #' Notes: #' 1) The computed RDS file contains a list structure with the following keys: info, costs, livestock-animals and crops #' All are data.tables. For all of them, the first columns are those that are contained in the "id" object #' "info" and "costs" are in table format, i.e. each farm is one row and data is on columns, as defined in the #' related raw_str_map.json file. #' "crops" and "livestock-animals" are in wide data format (https://tidyr.tidyverse.org/), where one farm lies accross many rows, and each #' row is a farm-crop-variableName-value combination #' #' 2) In $id, $info and $costs, "COLUMN IN CSV" can have two forms #' i) a single column name in the fadn.raw csv file or a combination, e.g. "K120SA+K120FC+K120FU+K120CV-K120BV" #' ii) the form of an object {"source": "the column in the csv", "description": "a description of what this column is about"} #' #' 3) We attach certain attributes that are useful for identifying informations: #' i) In $info and $costs, the attribute "column description" provide information of the formula and the description of each column #' ii) In $crops and $livestock-animals, the attribute "$crops.descriptions" and "$livestock.descriptions", provide the description of each CROP contained there #' iii) In $crops and $ the attribute "$column.formulas" provide the formulas used in order to derive the VALUE #' #' #' #' #' @param fadn.country string with the country to extract the str data #' @param fadn.year the year to extract the structured data #' @param raw_str_map.file the full path to the raw_str_map file. #' @param str.short_name the short name of the str data. No spaces and text up to 20 characters #' @param DEBUG if TRUE, prints more details on the conversion process #' #' @return Saves the rds.str.fadn and returns TRUE if everything goes well #' #' @export #' #' @examples convert.to.fadn.str.rds <-function(fadn.country = NA, fadn.year= NA, raw_str_map.file=NULL, force_external_raw_str_map=FALSE, str.name = NULL, DEBUG=F ) { #check if str.short_name abides to specification if(nchar(str.name)>20){ cat("str.name more should be 20 character and less. Exiting ....\n") return(invisible(FALSE)) } if (grepl("\\s",str.name)) { cat("str.name should not contain any kind of whitespace. Exiting ....\n") return(invisible(FALSE)) } # check for fadnUtils.data.dir if(is.null(get.data.dir())) { cat("You have first to set the fadnUtils.data.dir using set.data.dir function. Exiting ....\n") return(invisible(FALSE)) } #if data.dir is a proper dat.dir if(!check.data.dir.structure()) { cat("data.dir does not have a proper structure. Exiting ....\n") return(invisible(FALSE)) } else { data.dir = get.data.dir(); rds.dir = paste0(data.dir,"/rds/") str.dir = paste0(rds.dir,"/",str.name,"/") } #create/get the raw_str_map.json file if(is.null(raw_str_map.file) & !file.exists(paste0(str.dir,"raw_str_map.json"))) { cat("either provide a raw_str_map or an existing extraction dir Exiting ....\n") return(invisible(FALSE)) } #create/get the raw_str_map.json file ---- ##case a if(!is.null(raw_str_map.file) & !file.exists(paste0(str.dir,"raw_str_map.json"))) { dir.create(str.dir) file.copy(raw_str_map.file,paste0(str.dir,"raw_str_map.json"))#copy the file cat("\n", raw_str_map.file, " copied to ", paste0(str.dir,"raw_str_map.json\n")) } ##case b if(!is.null(raw_str_map.file) & file.exists(paste0(str.dir,"raw_str_map.json")) & force_external_raw_str_map==T) { file.copy(raw_str_map.file,paste0(str.dir,"raw_str_map.json"), overwrite = TRUE)#copy the file cat("\n", raw_str_map.file, " copied to ", paste0(str.dir,"raw_str_map.json\n")) } ##case c if(!is.null(raw_str_map.file) & file.exists(paste0(str.dir,"raw_str_map.json")) & force_external_raw_str_map==F) { cat("Ignoring the provided raw_str_map.json fie. The conversion will use the existing raw_str_map from the extraction_dir"); } raw_str_map.file = paste0(str.dir,"raw_str_map.json") # read raw.rds file ---- #check if fadn.raw.rds exist fadn.raw.rds.filename = paste0(rds.dir,"fadn.raw.",fadn.year,".",fadn.country,".rds") if(!file.exists(fadn.raw.rds.filename)) { cat("You have first to create a fadn.raw.rds file for the year and country (from a DG-AGRI csv file). Exiting ....\n") return(FALSE) } fadn.raw.rds = readRDS(fadn.raw.rds.filename) library("jsonlite") #read raw_str_map file to a list ---- raw_str_map = fromJSON(paste(readLines(raw_str_map.file), collapse="\n")) #create empty return list data.return = list() attach(fadn.raw.rds) #o that eval works more efficiently # ....................................................... #what id variables will be availble to all DT? #create id ---- print("Doing id ...") id.vars.list = take.raw_str_map.columns(raw_str_map$id) id.dt = data.table() id.dt.descriptions = data.frame(COLUMN=character(), FORMULA=character(), DESCRIPTION = character()); #intersting links # # https://stackoverflow.com/questions/28327738/evaluate-expression-in-r-data-table start.time <- Sys.time() for(k in names(id.vars.list)) { if(DEBUG) { print(paste0(" doing ", k, " = ", id.vars.list[[k]][["SOURCE"]])); } #approach 1: 19 sec # f <- function(e, .SD) eval(parse(text=e[1]), envir=.SD) # id.dt.cur = fadn.raw.rds[,list( # k=f(id.vars.list[[k]][["SOURCE"]], # .SD) # )] # id.dt.cur = fadn.raw.rds[,list(k=f(id.vars.list[[k]][["SOURCE"]],.SD))] #approach 0: 10 sec #id.dt.cur = fadn.raw.rds[,list(k=eval(parse(text=id.vars.list[[k]][["SOURCE"]])))] #approach 2: 0.3 sec expr = eval(parse(text=id.vars.list[[k]][["SOURCE"]])) id.dt.cur = fadn.raw.rds[,list(k=expr)] setnames(id.dt.cur,k) id.dt=cbind(id.dt,id.dt.cur) id.dt.descriptions = rbind(id.dt.descriptions, data.frame(COLUMN=k, FORMULA=id.vars.list[[k]][["SOURCE"]], DESCRIPTION = id.vars.list[[k]][["DESCRIPTION"]]) ) } attr(id.dt,"column.descriptions") <- id.dt.descriptions; end.time <- Sys.time() time.taken <- end.time - start.time time.taken # ....................................................... #create info ---- print("Doing info ...") info = copy(id.dt) info.descriptions = data.frame(COLUMN=character(), FORMULA=character(), DESCRIPTION = character()); ##now for each info key, add the column info.map = take.raw_str_map.columns(raw_str_map$info) for(k in names(info.map)) { if(DEBUG) { print(paste0(" doing ", k, " = ", info.map[[k]][["SOURCE"]])); } #info.cur = fadn.raw.rds[,list(k=eval(parse(text=info.map[[k]][["SOURCE"]])))] expr = eval(parse(text=info.map[[k]][["SOURCE"]])) info.cur = fadn.raw.rds[,list(k=expr)] #fadn.raw.rds[,list(k=eval(parse(text=info.map[[k]])))] setnames(info.cur,names(info.cur),k) info=cbind(info,info.cur) info.descriptions = rbind(info.descriptions, data.frame(COLUMN=k, FORMULA=info.map[[k]][["SOURCE"]], DESCRIPTION = info.map[[k]][["DESCRIPTION"]]) ) } attr(info,"column.descriptions") <- rbind( attr(info,"column.descriptions"), info.descriptions ) data.return$info = info; # # ....................................................... # #create livestock-animals ---- # # if(DEBUG){cat("\n")} # print("Doing livestock-animals ...") # # ##now load the map # lvst.animals.map = raw_str_map$livestock$animals # # ##if not empty # if(length(names(lvst.animals.map))>0) { # lvst.animals.id = copy(id.dt) # lvst.animals.descriptions = data.frame( LIVESTOCK=character(), DESCRIPTION=character() ); # lvst.animals.column.formulas = data.frame( LIVESTOCK=character(), COLUMN=character(), FORMULA=character()); # # lvst.animals = NULL #this is the DT to save the data # # # for(lvst.animal.key in names(lvst.animals.map)) { #loop each lvst.animals name (e.g. DCOW, SCOW, etc.) # # if(DEBUG) { print(paste0(" ", lvst.animal.key)); } else { cat(".")} # # lvst.animal.key.map = lvst.animals.map[[lvst.animal.key]] # # lvst.animals.descriptions = rbind(lvst.animals.descriptions, # data.frame( LIVESTOCK=lvst.animal.key, # DESCRIPTION=lvst.animal.key.map[["description"]] ) ) # # for(k in names(lvst.animal.key.map[["columns"]])) { #loop each key within the animal name # # cmd = parse(text=(lvst.animal.key.map[["columns"]][[k]])) # if(DEBUG) { print(paste0(" running ",k," = ", cmd)); } # # #lvst.animals.cur = fadn.raw.rds[,list(k=eval(parse(text=(lvst.animal.key.map[["columns"]][[k]]) )))] # expr = eval(parse(text=(lvst.animal.key.map[["columns"]][[k]]) )) # lvst.animals.cur = fadn.raw.rds[,list(k=expr)] # # # setnames(lvst.animals.cur,names(lvst.animals.cur),"VALUE") # # lvst.animals.column.formulas = rbind(lvst.animals.column.formulas, # data.frame(LIVESTOCK= lvst.animal.key, COLUMN=k , FORMULA=lvst.animal.key.map[["columns"]][[k]]) # ); # # if(is.null(lvst.animals)) { # lvst.animals = cbind(lvst.animals.id,LIVESTOCK=lvst.animal.key,VARIABLE=k,lvst.animals.cur) # } # else{ # lvst.animals=rbind(lvst.animals[VALUE!=0], # cbind(lvst.animals.id,LIVESTOCK=lvst.animal.key,VARIABLE=k,lvst.animals.cur) # ) # } # # } # # } # # lvst.animals$VARIABLE = factor(lvst.animals$VARIABLE) # attr(lvst.animals,"column.formulas") <- lvst.animals.column.formulas # attr(lvst.animals,"livestock.descriptions") <- lvst.animals.descriptions # data.return$lvst$animals = lvst.animals[VALUE!=0]; # # if(!DEBUG){cat("\n")} # # } # # # # # ....................................................... # #create livestock-products ---- # # if(DEBUG){cat("\n")} # print("Doing livestock-products ...") # # ##now load the map # lvst.animals.map = raw_str_map$livestock$products # # ##if not empty # if(length(names(lvst.animals.map))>0) { # lvst.animals.id = copy(id.dt) # lvst.animals.descriptions = data.frame( LIVESTOCK=character(), DESCRIPTION=character() ); # lvst.animals.column.formulas = data.frame( LIVESTOCK=character(), COLUMN=character(), FORMULA=character()); # # lvst.animals = NULL #this is the DT to save the data # # # for(lvst.animal.key in names(lvst.animals.map)) { #loop each lvst.animals name (e.g. DCOW, SCOW, etc.) # # if(DEBUG) { print(paste0(" ", lvst.animal.key)); } else { cat(".")} # # lvst.animal.key.map = lvst.animals.map[[lvst.animal.key]] # # lvst.animals.descriptions = rbind(lvst.animals.descriptions, # data.frame( LIVESTOCK=lvst.animal.key, # DESCRIPTION=lvst.animal.key.map[["description"]] ) ) # # for(k in names(lvst.animal.key.map[["columns"]])) { #loop each key within the animal name # # cmd = parse(text=(lvst.animal.key.map[["columns"]][[k]])) # if(DEBUG) { print(paste0(" running ",k," = ", cmd)); } # # #lvst.animals.cur = fadn.raw.rds[,list(k=eval(parse(text=(lvst.animal.key.map[["columns"]][[k]]) )))] # expr = eval(parse(text=(lvst.animal.key.map[["columns"]][[k]]) )) # lvst.animals.cur = fadn.raw.rds[,list(k=expr)] # # # setnames(lvst.animals.cur,names(lvst.animals.cur),"VALUE") # # lvst.animals.column.formulas = rbind(lvst.animals.column.formulas, # data.frame(LIVESTOCK= lvst.animal.key, COLUMN=k , FORMULA=lvst.animal.key.map[["columns"]][[k]]) # ); # # if(is.null(lvst.animals)) { # lvst.animals = cbind(lvst.animals.id,LIVESTOCK=lvst.animal.key,VARIABLE=k,lvst.animals.cur) # } # else{ # lvst.animals=rbind(lvst.animals[VALUE!=0], # cbind(lvst.animals.id,LIVESTOCK=lvst.animal.key,VARIABLE=k,lvst.animals.cur) # ) # } # # } # # } # # lvst.animals$VARIABLE = factor(lvst.animals$VARIABLE) # attr(lvst.animals,"column.formulas") <- lvst.animals.column.formulas # attr(lvst.animals,"livestock.descriptions") <- lvst.animals.descriptions # data.return$lvst$products = lvst.animals[VALUE!=0]; # # if(!DEBUG){cat("\n")} # # } # ....................................................... #create crops ---- print("Doing crops ...") ##now load the map crops.map = raw_str_map$crops ##if not empty if(length(names(crops.map))>0) { crops.id = copy(id.dt) crops.descriptions = data.frame( CROP=character(), DESCRIPTION=character() ); crops.column.formulas = data.frame( CROP=character(), COLUMN=character(), FORMULA=character()); crops = NULL #this is the DT to save the data for(crop.key in names(crops.map)) { #loop each crop name (e.g. DWHE, BARL, etc.) if(DEBUG) { print(paste0(" ", crop.key)); } else { cat(".")} crop.key.map = crops.map[[crop.key]] crops.descriptions = rbind(crops.descriptions, data.frame( CROP=crop.key, DESCRIPTION=crop.key.map[["description"]] ) ) for(k in names(crop.key.map[["columns"]])) { #loop each key within the crop name cmd = parse(text=(crop.key.map[["columns"]][[k]])) if(DEBUG) { print(paste0(" running ",k," = ", cmd)); } #crops.cur = fadn.raw.rds[,list(k=eval(parse(text=(crop.key.map[["columns"]][[k]]) )))] expr = eval(parse(text=(crop.key.map[["columns"]][[k]]) )) crops.cur = fadn.raw.rds[,list(k=expr)] setnames(crops.cur,names(crops.cur),"VALUE") crops.column.formulas = rbindlist( list(crops.column.formulas, data.frame(CROP= crop.key, COLUMN=k , FORMULA=crop.key.map[["columns"]][[k]]) ) ); if(is.null(crops)) { crops = cbind(crops.id,CROP=crop.key,VARIABLE=k,crops.cur) } else{ crops=rbind(crops[VALUE!=0], cbind(crops.id,CROP=crop.key,VARIABLE=k,crops.cur) ) } } } crops$VARIABLE = factor(crops$VARIABLE) attr(crops,"column.formulas") <- crops.column.formulas attr(crops,"crops.descriptions") <- crops.descriptions data.return$crops = crops[VALUE!=0]; } cat("\n") detach(fadn.raw.rds) # ....................................................... #save and return ---- data.name = paste0("fadn.str.",fadn.year,".",fadn.country,".rds") if(file.exists(data.name)) {cat("Data exists. Overwriting ...\n")} saveRDS(data.return,paste0(rds.dir,str.name,"/",data.name)) #copy the rds return(invisible(TRUE)) }