GitLab at IIASA

convert_data.R 20.8 KiB
Newer Older
Xinxin Yang's avatar
Xinxin Yang committed




#' Imports a DG-AGRI csv into fadnUtils
#'
#' It first call the convert.to.fadn.raw.rds and then convert.to.fadn.str.rds
#'
#' @param file.path the full path of the file (the filename must be included)
#' @param raw.f the raw_str_map file to use. it must reside inside 'raw_str_maps; folder of the data.dir
#' @param sepS the separator of the csv files (by default ",")
#' @param fadn.year the year the csv files refers to (e.g. 2001)
#' @param fadn.country the three letter country code the csv files refers to (e.g. "ELL")
#' @param keep.csv if TRUE, copy the csv files; else do not copy
#'
#' @return NULL
#' @export
#'
#' @examples
import.fadn.csv <- function (file.path,
                             raw.f=NULL,
                             sepS=",",
                             fadn.year= NA,
                             fadn.country = NA,
                             keep.csv=F) {

  #if file exist
  if(!file.exists(file.path)) {
    cat(paste0("File ",file.path," does not exist. Exiting ...\n"))
    return(invisible(FALSE))
  }

  # check for fadnUtils.data.dir
  if(is.null(get.data.dir())) {
    cat("You have first to set the fadnUtils.data.dir using set.data.dir function. Exiting ....\n")
    return(FALSE)
  } else {
    data.dir = get.data.dir();
    csv.file = basename(file.path)
  }

  if(is.null(raw.f)) {
    cat("You have to give a raw_str_map. Exiting ....\n")
    return(FALSE)
  }

  if(convert.to.fadn.raw.rds(file.path,sepS,fadn.year,fadn.country,keep.csv)) {
    convert.to.fadn.str.rds(fadn.country,fadn.year,raw.f)
  } else {
    cat("Failed to import. Exiting ...\n")
    return(invisible(NULL))
  }



}



#' Gets a fadn.raw.csv (csv file from DG-AGRI) and transforms it accordingly to fadn.raw.rds
#'
#' It saves two files:
#'  - One that contain a wide format of the data, i.e. in tabular format that is identical to the csv data. This is uncompressed data.
#'  - One that holds the same information in compressed data. It is a list that contains $data.char and $data.num data.tables in long format. 0 values are removed and only the col.id is the index on both data.tables
#'
#' @param file.path the full path of the csv file (the filename must be included)
#' @param sepS the separator of the csv files (by default ",")
#' @param fadn.year the year the csv files refers to (e.g. 2001)
#' @param fadn.country the three letter country code the csv files refers to (e.g. "ELL")
#' @param keep.csv if TRUE, copy the csv files to the CSV directory; else do not copy
#'
#' @return Saves the fadn.raw.rds file and returns TRUE if everything goes well
#' @import data.table
#'
#' @export
#' @examples
convert.to.fadn.raw.rds <- function(file.path="",
                           sepS=",",
                           fadn.year= NA,
                           fadn.country = NA,
                           keep.csv = F,
                           col.id = "ID") {

  library(data.table)

  #if file exist
  if(!file.exists(file.path)) {
    cat(paste0("File ",file.path," does not exist. Exiting ...\n"))
    return(FALSE)
  }

  # check for fadnUtils.data.dir
  if(is.null(get.data.dir())) {
    cat("You have first to set the fadnUtils.data.dir using set.data.dir function. Exiting ....\n")
    return(FALSE)
  } else {
    data.dir = get.data.dir();
    csv.file = basename(file.path)
  }

  if(file.exists(paste0(data.dir,"/csv/",csv.file))) {cat("File exists. Overwriting ...\n")}

  #copy csv to data.dir/csv
  if(keep.csv) {
    print("      copying file")
    file.copy(file.path,paste0(data.dir,"/csv/",csv.file))
  }

  #convert to uncompressed rds and save
  print("      creating fadn.raw.rds")
  data.raw = data.table(read.csv(file.path,header = T, as.is = T))

  attr(data.raw,"original.file.path") <-file.path
  attr(data.raw,"fadn.year")<-fadn.year
  attr(data.raw,"fadn.country")<-fadn.country

  data.name  = paste0("fadn.raw.",fadn.year,".",fadn.country,".rds")

  saveRDS(data.raw,paste0(data.dir,"/rds/",data.name))


  # #convert to compressed rds and save
  # data.raw.classes = data.table(col.name=names(data.raw),col.class=sapply(data.raw,class))
  # data.raw.compr = list()
  #
  # char.cols = c(col.id, data.raw.classes[col.class=="character",col.name])
  # data.raw.compr$data.char = data.raw[,..char.cols]
  #
  # num.cols = c(col.id, data.raw.classes[!col.class=="character",col.name])
  # data.raw.compr$data.num = melt(data.raw[,..num.cols],id.vars = col.id)[!value==0]
  #
  # attr(data.raw.compr,"original.file.path") <-file.path
  # attr(data.raw.compr,"fadn.year")<-fadn.year
  # attr(data.raw.compr,"fadn.country")<-fadn.country
  # attr(data.raw.compr,"col.names")<-names(data.raw)
  # attr(data.raw.compr,"col.id")<-col.id
  #
  # data.name  = paste0("fadn.raw.",fadn.year,".",fadn.country,".compressed.rds")
  # saveRDS(data.raw.compr,paste0(data.dir,"/rds/",data.name))
Xinxin Yang's avatar
Xinxin Yang committed



  return(invisible(TRUE))

}



#' Converts an fadn.raw.rds file to fadn.str.rds file using a raw_str_map.json file
#'
#' The raw_str_map.json specification is as follows:
#'
#' {
#'    "id": { "COLUMN in every list member in RDS": "COLUMN IN CSV", ....},
#'    "info": { "COLUMN in info RDS": "COLUMN IN CSV", ....},
#'    "livestock": {}
#'    "crops": {
#'               "CROP NAME 1": {"description": "description of crop name", "columns": {"VARIABLE NAME": COLUMN IN CSV", ....}   },
#'               "CROP NAME 2": {"description": "description of crop name", "columns": {"VARIABLE NAME": COLUMN IN CSV", ....}   },
#'               ....
#'    }
#' }
#'
#'
#' The structure of the str.dir:
#'  - A data.dir can hold more than one extractions.
#'  - Each extraction has a short name (20 or less characters, whitespace is not allowed)
#'  - Each extraction is stored in the data.dir/rds/<extraction_name>
#'  - That folder contains the following files:
#'       + raw_str_map.json: the raw_str_map
#'       + fadn.str.<4-digit YEAR>.<3-letter COUNTRY>.rds: the extracted data
#'
#' Notes:
#' 1) The computed RDS file contains a list structure with the following keys: info, costs, livestock-animals and crops
#'    All are data.tables. For all of them, the first columns are those that are contained in the "id" object
#'    "info" and "costs" are in table format, i.e. each farm is one row and data is on columns, as defined in the
#'        related raw_str_map.json file.
#'    "crops" and "livestock-animals" are in wide data format (https://tidyr.tidyverse.org/), where one farm lies accross many rows, and each
#'        row is a farm-crop-variableName-value combination
#'
#' 2) In $id, $info and $costs, "COLUMN IN CSV" can have two forms
#'     i) a single column name in the fadn.raw csv file or a combination, e.g. "K120SA+K120FC+K120FU+K120CV-K120BV"
#'     ii) the form of an object {"source": "the column in the csv", "description": "a description of what this column is about"}
#'
#' 3) We attach certain attributes that are useful for identifying informations:
#'     i) In $info and $costs, the attribute "column description" provide information of the formula and the description of each column
#'     ii) In $crops and $livestock-animals, the attribute "$crops.descriptions" and "$livestock.descriptions", provide the description of each CROP contained there
#'     iii) In $crops and $ the attribute "$column.formulas" provide the formulas used in order to derive the VALUE
#'
#'
#'
#'
#' @param fadn.country string with the country to extract the str data
#' @param fadn.year the year to extract the structured data
#' @param raw_str_map.file the full path to the raw_str_map file.
#' @param str.short_name the short name of the str data. No spaces and text up to 20 characters
#' @param DEBUG if TRUE, prints more details on the conversion process
#'
#' @return Saves the rds.str.fadn and returns TRUE if everything goes well
#'
#' @export
#'
#' @examples
convert.to.fadn.str.rds <-function(fadn.country = NA,
                                   fadn.year= NA,
                                   raw_str_map.file=NULL,
                                   force_external_raw_str_map=FALSE,
                                   str.name = NULL,
                                   DEBUG=F
                                   )  {

  #check if str.short_name abides to specification
  if(nchar(str.name)>20){
    cat("str.name more should be 20 character and less. Exiting ....\n")
    return(invisible(FALSE))
  }
  if (grepl("\\s",str.name)) {
    cat("str.name should not contain any kind of whitespace. Exiting ....\n")
    return(invisible(FALSE))
  }


  # check for fadnUtils.data.dir
  if(is.null(get.data.dir())) {
    cat("You have first to set the fadnUtils.data.dir using set.data.dir function. Exiting ....\n")
    return(invisible(FALSE))
  }


  #if data.dir is a proper dat.dir
  if(!check.data.dir.structure()) {
    cat("data.dir does not have a proper structure. Exiting ....\n")
    return(invisible(FALSE))

  } else {
    data.dir = get.data.dir();
    rds.dir = paste0(data.dir,"/rds/")
    str.dir = paste0(rds.dir,"/",str.name,"/")

  }


  #create/get the raw_str_map.json file
  if(is.null(raw_str_map.file) & !file.exists(paste0(str.dir,"raw_str_map.json"))) {
    cat("either provide a raw_str_map or an existing extraction dir Exiting ....\n")
    return(invisible(FALSE))
  }



  #create/get the raw_str_map.json file ----

  ##case a
  if(!is.null(raw_str_map.file) & !file.exists(paste0(str.dir,"raw_str_map.json"))) {
    dir.create(str.dir)
    file.copy(raw_str_map.file,paste0(str.dir,"raw_str_map.json"))#copy the file
    cat("\n", raw_str_map.file, " copied to ", paste0(str.dir,"raw_str_map.json\n"))
  }

  ##case b
  if(!is.null(raw_str_map.file) & file.exists(paste0(str.dir,"raw_str_map.json")) & force_external_raw_str_map==T) {
    file.copy(raw_str_map.file,paste0(str.dir,"raw_str_map.json"), overwrite = TRUE)#copy the file
    cat("\n", raw_str_map.file, " copied to ", paste0(str.dir,"raw_str_map.json\n"))
  }

  ##case c
  if(!is.null(raw_str_map.file) & file.exists(paste0(str.dir,"raw_str_map.json")) & force_external_raw_str_map==F) {
    cat("Ignoring the provided raw_str_map.json fie. The conversion will use the existing raw_str_map from the extraction_dir");
  }

  raw_str_map.file = paste0(str.dir,"raw_str_map.json")





  # read raw.rds file ----
  #check if fadn.raw.rds exist
  fadn.raw.rds.filename = paste0(rds.dir,"fadn.raw.",fadn.year,".",fadn.country,".rds")

  if(!file.exists(fadn.raw.rds.filename)) {
    cat("You have first to create a fadn.raw.rds file for the year and country (from a DG-AGRI csv file). Exiting ....\n")
    return(FALSE)
  }
  fadn.raw.rds = readRDS(fadn.raw.rds.filename)


  library("jsonlite")


  #read raw_str_map file to a list ----
  raw_str_map = fromJSON(paste(readLines(raw_str_map.file), collapse="\n"))


  #create empty return list
  data.return = list()


  attach(fadn.raw.rds) #o that eval works more efficiently

  # .......................................................
  #what id variables will be availble to all DT?
  #create id ----
  print("Doing id ...")

  id.vars.list = take.raw_str_map.columns(raw_str_map$id)
  id.dt = data.table()
  id.dt.descriptions = data.frame(COLUMN=character(), FORMULA=character(), DESCRIPTION = character());

  #intersting links
  #
  # https://stackoverflow.com/questions/28327738/evaluate-expression-in-r-data-table




  start.time <- Sys.time()
  for(k in names(id.vars.list)) {

    if(DEBUG) { print(paste0("     doing  ", k, " = ", id.vars.list[[k]][["SOURCE"]])); }

    #approach 1: 19 sec
    # f <- function(e, .SD) eval(parse(text=e[1]), envir=.SD)
    # id.dt.cur = fadn.raw.rds[,list(
    #   k=f(id.vars.list[[k]][["SOURCE"]],
    #       .SD)
    # )]
    # id.dt.cur = fadn.raw.rds[,list(k=f(id.vars.list[[k]][["SOURCE"]],.SD))]

    #approach 0: 10 sec
    #id.dt.cur = fadn.raw.rds[,list(k=eval(parse(text=id.vars.list[[k]][["SOURCE"]])))]

    #approach 2: 0.3 sec
    expr = eval(parse(text=id.vars.list[[k]][["SOURCE"]]))
    id.dt.cur = fadn.raw.rds[,list(k=expr)]



    setnames(id.dt.cur,k)

    id.dt=cbind(id.dt,id.dt.cur)

    id.dt.descriptions = rbind(id.dt.descriptions,
                               data.frame(COLUMN=k,
                                          FORMULA=id.vars.list[[k]][["SOURCE"]],
                                          DESCRIPTION = id.vars.list[[k]][["DESCRIPTION"]])
    )

  }
  attr(id.dt,"column.descriptions") <- id.dt.descriptions;
  end.time <- Sys.time()
  time.taken <- end.time - start.time
  time.taken



  # .......................................................
  #create info ----
  print("Doing info ...")

  info = copy(id.dt)
  info.descriptions = data.frame(COLUMN=character(), FORMULA=character(), DESCRIPTION = character());


  ##now for each info key, add the column
  info.map = take.raw_str_map.columns(raw_str_map$info)
  for(k in names(info.map)) {

    if(DEBUG) { print(paste0("     doing  ", k, " = ", info.map[[k]][["SOURCE"]])); }

    #info.cur = fadn.raw.rds[,list(k=eval(parse(text=info.map[[k]][["SOURCE"]])))]

    expr = eval(parse(text=info.map[[k]][["SOURCE"]]))
    info.cur = fadn.raw.rds[,list(k=expr)]


      #fadn.raw.rds[,list(k=eval(parse(text=info.map[[k]])))]
    setnames(info.cur,names(info.cur),k)
    info=cbind(info,info.cur)

    info.descriptions = rbind(info.descriptions,
                               data.frame(COLUMN=k,
                                          FORMULA=info.map[[k]][["SOURCE"]],
                                          DESCRIPTION = info.map[[k]][["DESCRIPTION"]])
    )
  }

  attr(info,"column.descriptions") <- rbind( attr(info,"column.descriptions"), info.descriptions )

  data.return$info = info;




  # # .......................................................
  # #create livestock-animals ----
  #
  # if(DEBUG){cat("\n")}
  # print("Doing livestock-animals ...")
  #
  # ##now load the map
  # lvst.animals.map = raw_str_map$livestock$animals
  #
  # ##if not empty
  # if(length(names(lvst.animals.map))>0) {
  #   lvst.animals.id = copy(id.dt)
  #   lvst.animals.descriptions = data.frame( LIVESTOCK=character(), DESCRIPTION=character() );
  #   lvst.animals.column.formulas = data.frame( LIVESTOCK=character(), COLUMN=character(), FORMULA=character());
  #
  #   lvst.animals = NULL #this is the DT to save the data
  #
  #
  #   for(lvst.animal.key in names(lvst.animals.map)) { #loop each lvst.animals name (e.g. DCOW, SCOW, etc.)
  #
  #     if(DEBUG) { print(paste0("  ", lvst.animal.key)); } else { cat(".")}
  #
  #     lvst.animal.key.map = lvst.animals.map[[lvst.animal.key]]
  #
  #     lvst.animals.descriptions = rbind(lvst.animals.descriptions,
  #                                       data.frame( LIVESTOCK=lvst.animal.key,
  #                                                   DESCRIPTION=lvst.animal.key.map[["description"]] ) )
  #
  #     for(k in names(lvst.animal.key.map[["columns"]])) { #loop each key within the animal name
  #
  #       cmd = parse(text=(lvst.animal.key.map[["columns"]][[k]]))
  #       if(DEBUG) { print(paste0("     running ",k," = ", cmd)); }
  #
  #       #lvst.animals.cur = fadn.raw.rds[,list(k=eval(parse(text=(lvst.animal.key.map[["columns"]][[k]]) )))]
  #       expr = eval(parse(text=(lvst.animal.key.map[["columns"]][[k]]) ))
  #       lvst.animals.cur = fadn.raw.rds[,list(k=expr)]
  #
  #
  #       setnames(lvst.animals.cur,names(lvst.animals.cur),"VALUE")
  #
  #       lvst.animals.column.formulas = rbind(lvst.animals.column.formulas,
  #                                            data.frame(LIVESTOCK= lvst.animal.key, COLUMN=k , FORMULA=lvst.animal.key.map[["columns"]][[k]])
  #       );
  #
  #       if(is.null(lvst.animals)) {
  #         lvst.animals = cbind(lvst.animals.id,LIVESTOCK=lvst.animal.key,VARIABLE=k,lvst.animals.cur)
  #       }
  #       else{
  #         lvst.animals=rbind(lvst.animals[VALUE!=0],
  #                            cbind(lvst.animals.id,LIVESTOCK=lvst.animal.key,VARIABLE=k,lvst.animals.cur)
  #         )
  #       }
  #
  #     }
  #
  #   }
  #
  #   lvst.animals$VARIABLE = factor(lvst.animals$VARIABLE)
  #   attr(lvst.animals,"column.formulas") <- lvst.animals.column.formulas
  #   attr(lvst.animals,"livestock.descriptions") <- lvst.animals.descriptions
  #   data.return$lvst$animals = lvst.animals[VALUE!=0];
  #
  #   if(!DEBUG){cat("\n")}
  #
  # }
  #
  #
  #
  # .......................................................
  #create livestock-products ----

  if(DEBUG){cat("\n")}
  print("Doing livestock-products ...")

  ##now load the map
  lvst.animals.map = raw_str_map$livestock$production
Xinxin Yang's avatar
Xinxin Yang committed
  # print(lvst.animals.map)
  ##if not empty
  if(length(names(lvst.animals.map))>0) {
    # print('xinxin....')
    lvst.animals.id = copy(id.dt)
    lvst.animals.descriptions = data.frame( LIVESTOCK=character(), DESCRIPTION=character() );
    lvst.animals.column.formulas = data.frame( LIVESTOCK=character(), COLUMN=character(), FORMULA=character());

    lvst.animals = NULL #this is the DT to save the data


    for(lvst.animal.key in names(lvst.animals.map)) { #loop each lvst.animals name (e.g. DCOW, SCOW, etc.)

      if(DEBUG) { print(paste0("  ", lvst.animal.key)); } else { cat(".")}

      lvst.animal.key.map = lvst.animals.map[[lvst.animal.key]]

      lvst.animals.descriptions = rbind(lvst.animals.descriptions,
                                        data.frame( LIVESTOCK=lvst.animal.key,
                                                    DESCRIPTION=lvst.animal.key.map[["description"]] ) )

      for(k in names(lvst.animal.key.map[["columns"]])) { #loop each key within the animal name

        cmd = parse(text=(lvst.animal.key.map[["columns"]][[k]]))
        if(DEBUG) { print(paste0("     running ",k," = ", cmd)); }

        #lvst.animals.cur = fadn.raw.rds[,list(k=eval(parse(text=(lvst.animal.key.map[["columns"]][[k]]) )))]
        expr = eval(parse(text=(lvst.animal.key.map[["columns"]][[k]]) ))
        lvst.animals.cur = fadn.raw.rds[,list(k=expr)]


        setnames(lvst.animals.cur,names(lvst.animals.cur),"VALUE")

        lvst.animals.column.formulas = rbind(lvst.animals.column.formulas,
                                             data.frame(LIVESTOCK= lvst.animal.key, COLUMN=k , FORMULA=lvst.animal.key.map[["columns"]][[k]])
        );

        if(is.null(lvst.animals)) {
          lvst.animals = cbind(lvst.animals.id,LIVESTOCK=lvst.animal.key,VARIABLE=k,lvst.animals.cur)
        }
        else{
          lvst.animals=rbind(lvst.animals[VALUE!=0],
                             cbind(lvst.animals.id,LIVESTOCK=lvst.animal.key,VARIABLE=k,lvst.animals.cur)
          )
        }

      }

    }

    lvst.animals$VARIABLE = factor(lvst.animals$VARIABLE)
    attr(lvst.animals,"column.formulas") <- lvst.animals.column.formulas
    attr(lvst.animals,"livestock.descriptions") <- lvst.animals.descriptions
    data.return$lvst$products = lvst.animals[VALUE!=0];
    # print(data.return$lvst$products)
    if(!DEBUG){cat("\n")}

  }
Xinxin Yang's avatar
Xinxin Yang committed


  # .......................................................
  #create crops ----

  print("Doing crops ...")

  ##now load the map
  crops.map = raw_str_map$crops

  ##if not empty
  if(length(names(crops.map))>0) {
    crops.id = copy(id.dt)
    crops.descriptions = data.frame( CROP=character(), DESCRIPTION=character() );
    crops.column.formulas = data.frame( CROP=character(), COLUMN=character(), FORMULA=character());

    crops = NULL #this is the DT to save the data



    for(crop.key in names(crops.map)) { #loop each crop name (e.g. DWHE, BARL, etc.)

      if(DEBUG) { print(paste0("  ", crop.key)); } else { cat(".")}

      crop.key.map = crops.map[[crop.key]]

      crops.descriptions = rbind(crops.descriptions, data.frame( CROP=crop.key, DESCRIPTION=crop.key.map[["description"]] ) )

      for(k in names(crop.key.map[["columns"]])) { #loop each key within the crop name

        cmd = parse(text=(crop.key.map[["columns"]][[k]]))
        if(DEBUG) { print(paste0("     running ",k," = ", cmd)); }

        #crops.cur = fadn.raw.rds[,list(k=eval(parse(text=(crop.key.map[["columns"]][[k]]) )))]
        expr = eval(parse(text=(crop.key.map[["columns"]][[k]]) ))
        crops.cur = fadn.raw.rds[,list(k=expr)]


        setnames(crops.cur,names(crops.cur),"VALUE")

        crops.column.formulas = rbindlist(
          list(crops.column.formulas,
               data.frame(CROP= crop.key, COLUMN=k , FORMULA=crop.key.map[["columns"]][[k]])
          )
        );

        if(is.null(crops)) {
          crops = cbind(crops.id,CROP=crop.key,VARIABLE=k,crops.cur)
          }
        else{
          crops=rbind(crops[VALUE!=0],
                      cbind(crops.id,CROP=crop.key,VARIABLE=k,crops.cur)
          )
        }

      }

    }

    crops$VARIABLE = factor(crops$VARIABLE)
    attr(crops,"column.formulas") <- crops.column.formulas
    attr(crops,"crops.descriptions") <- crops.descriptions
    data.return$crops = crops[VALUE!=0];

  }
  cat("\n")



  detach(fadn.raw.rds)


Xinxin Yang's avatar
Xinxin Yang committed

  # .......................................................
  #save and return ----

  data.name  = paste0("fadn.str.",fadn.year,".",fadn.country,".rds")

  if(file.exists(data.name)) {cat("Data exists. Overwriting ...\n")}

  saveRDS(data.return,paste0(rds.dir,str.name,"/",data.name))

  #copy the rds

  return(invisible(TRUE))



}