GitLab at IIASA

yields.R 8.21 KiB
Newer Older
# -------------------------------------
# Script: yields.R
# Author: Hugo Scherer
# Purpose: Yield data preparation
# Version: 0.6.0
# Date: 06.10.2023
# Notes:
# Originally in arable_farmdata.R
#
# Copyright(c) 2023 Wageningen Economic Research
# -------------------------------------

yields_gen <- function(FADN, FADN2, reporting = FALSE) {
    message("Preparing yields data")
    # FADN2Dyn filtering

    FADN2Dyn <- FADN2Dyn[FADN2Dyn$crop_new %in% german_crops, ]

    # Select only what's needed (FarmDyn variable and FADN variables)

    # Subset FADN variables on area in ha of crops that map to FarmDyn crops
    croparea <- tibble(FADN2)[colnames(tibble(FADN2)) %in% c(area, "ID") & !colnames(tibble(FADN2)) %in% dis_crops]

    croparea2 <- FADN[, (intersect(c("ID", area), colnames(FADN))), with = FALSE]

    # Group by ID and sum all crop area variables into one column
    setDT(croparea)
    croparea$croparea <- base::Reduce(`+`, croparea[, -80])
    croparea <- croparea[!duplicated(croparea)]
    croparea$ID <- factor(croparea$ID)

    # Same as above but with production of crops in t
    cropprod <- FADN2[, (intersect(c("ID", production), colnames(FADN))), with = FALSE]

    # Same as above but with sales value of crops in EUR
    cropsalesvalue <- FADN2[, intersect(c(sales_value, "ID"), colnames(FADN)), with = FALSE]

    # Same as above but with sales quantity of crops in t
    cropsalesquant <- FADN2[, intersect(c(sales_quant, "ID"), colnames(FADN)), with = FALSE]

    crop_sales_quant <- FADN2[, intersect(c(FADN2Dyn$cropsalesquant, "ID"), colnames(FADN)), with = FALSE]

    # Pivotting to long because if the operations are done in wide format it will produce duplicates that will be corrected with '.1' at the end (or another nr depending on how many times it is duplicated)
    # with added benefits that the aprod/aarea are also in long format and that when pivotting to wide again a sum function can be added, so everything that belongs to the same FarmDyn crop category can be summed up

    colnames(aprod)[2] <- "item1"

    # Some columns are integers so it will throw a warning
    cropprod <- data.table::melt(
        cropprod,
        id.vars = "ID", variable.name = "item1", value.name = "prod"
    )

    cropprod <- merge(cropprod, aprod)

    colnames(aarea)[2] <- "item1"

    croparea <- melt(
        croparea,
        id.vars = "ID", variable.name = "item1", value.name = "area"
    )

    croparea <- merge(croparea, aarea)

    areacrop <- dcast(
        croparea, ID ~ crop_new,
        value.var = "area", fun.aggregate = sum
    )

    # operation above in data.table
    areacrop <- areacrop[, (setdiff(colnames(areacrop), "ID")) := lapply(.SD, function(x) ifelse(x < 1, 0, x)), .SDcols = (setdiff(colnames(areacrop), "ID"))]


    # Croparea2 is used for the calculation of the area of the
    # selected farms, croparea instead is used for
    # the calculation of yields because it contains ALL obs

    croparea2 <- melt(
        croparea2,
        id.vars = "ID", variable.name = "item1", value.name = "area"
    )

    colnames(aarea)[2] <- "item1"

    croparea2 <- merge(croparea2, aarea)

    areacrop2 <- dcast(
        croparea2, ID ~ crop_new,
        value.var = "area", fun.aggregate = sum
    )

    # Removing those crops with less than 1 ha
    setDT(areacrop2)
    areacrop2 <- areacrop2[, setdiff(colnames(areacrop2), "ID") := lapply(.SD, function(x) ifelse(x < 1, 0, x)), .SDcols = setdiff(colnames(areacrop2), "ID")]

    crop_sales_quant <- melt(
        crop_sales_quant,
        id.vars = "ID", variable.name = "item1", value.name = "sales_quant"
    )

    colnames(a_salesquant)[2] <- "item1"
    crop_sales_quant <- as.data.table(merge(a_salesquant, crop_sales_quant))


    crop_sales_quant <- dcast(
        crop_sales_quant, ID ~ crop_new,
        value.var = "sales_quant", fun.aggregate = sum
    )

    # Division of production over area to get yields (t) per ha
    setDT(cropprod)
    cropprod[, prod := sum(prod), by = list(ID, crop_new)]
    croparea[, area := sum(area), by = list(ID, crop_new)]
    cropprod <- merge(
        cropprod, croparea[, c("ID", "crop_new", "area")],
        by = c("ID", "crop_new")
    )

    cropprod[, yields := prod / area]

    # Condititonal to check if FADNcrops exists
    if (!file.exists(file.path("data", "temp", "yields", "FADNcrops_NUTS0.csv"))) {
        FADNcrops <- cropprod[-c(1, 3)] %>%
            pivot_wider(
                id_cols = "ID",
                names_from = "crop_new",
                values_from = "yields",
                values_fn = ~ sum(., na.rm = TRUE)
            )

        # Division by 0 problem,
        # logic tells us that if you have 0 production of something
        # over 0 hectares then that is going to be 0.
        FADNcrops[FADNcrops == Inf] <- NA

        names(FADNcrops) <- c("ID", paste0("yields%", names(FADNcrops[-1])))

        # Add NUTS level and SYS02 to FADNcrops from FADN2

        FADNcrops <- tibble(FADNcrops[, colSums(is.na(FADNcrops)) < nrow(FADNcrops)])

        FADNcrops <- merge(FADNcrops, FADN2[, c("ID", "NUTS0", "NUTS1", "NUTS2", "SYS02")])

        FADNcrops[FADNcrops == 0] <- NA

        # Summarize by NUTS level using weighted mean and weight SYS02
        for (i in 0:3) {
            assign(
                paste0("FADNcrops_NUTS", i),
                FADNcrops %>%
                    dplyr::group_by(
                        dplyr::across(
                            colnames(FADNcrops)[colnames(FADNcrops) %in% c(paste0("NUTS", i))]
                        )
                    ) %>%
                    select(-c(ID)) %>%
                    dplyr::summarise(dplyr::across(
                        tidyr::everything(),
                        ~ if (is.numeric(.)) {
                            weighted.mean(.,
                                na.rm = TRUE, w = as.numeric(as.character(SYS02))
                            )
                        } else {
                            Modes(.)[1]
                        }
                    ), n = dplyr::n())
            )
        }

        # Remove the n, SYS columns
        for (i in 0:3) {
            df <- get(paste0("FADNcrops_NUTS", i))
            df <- df[, -which(colnames(df) %in% c("n", "SYS02"))]
            assign(paste0("FADNcrops_NUTS", i), df)
        }

        # Impute with mean of column for NUTS0
        FADNcrops_NUTS0 <- FADNcrops_NUTS0 %>%
            dplyr::mutate(
                dplyr::across(dplyr::where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .))
            )

        # Impute with mean of column for each nuts level
        FADNcrops_NUTS1 <- dplyr::rows_patch(
            FADNcrops_NUTS1, FADNcrops_NUTS0,
            by = "NUTS0"
        )

        FADNcrops_NUTS2 <- dplyr::rows_patch(
            FADNcrops_NUTS2, FADNcrops_NUTS1,
            by = "NUTS1"
        )

        # Write to csv
        for (i in 0:3) {
            readr::write_csv(
                get(paste0("FADNcrops_NUTS", i)),
                file.path("data", "temp", "yields", paste0("FADNcrops_NUTS", i, ".csv"))
            )
        }
    } else {
        FADNcrops_NUTS0 <- readr::read_csv(
            file.path("data", "temp", "yields", "FADNcrops_NUTS0.csv"),
            col_names = TRUE,
            show_col_types = FALSE
        )
        FADNcrops_NUTS1 <- readr::read_csv(
            file.path("data", "temp", "yields", "FADNcrops_NUTS1.csv"),
            col_names = TRUE,
            show_col_types = FALSE
        )
        FADNcrops_NUTS2 <- readr::read_csv(
            file.path("data", "temp", "yields", "FADNcrops_NUTS2.csv"),
            col_names = TRUE,
            show_col_types = FALSE
        )
    }

    # Create descriptive statistics of yields for NUTS0
    # Remove the "yields%" from the column names
    if (reporting == TRUE) {
        crops_rep <- FADNcrops_NUTS0 %>%
            left_join(defaultvalsNUTS0, by = "NUTS0")
        colnames(crops_rep) <- gsub("yields%", "", colnames(crops_rep))
        crops_rep <- crops_rep %>%
            select(
                "NUTS0", "MaizCorn", "MaizSil", "SummerCere", "summerBeans",
                "Winterbarley", "SugarBeet", "WinterRape", "WinterWheat", "Potatoes",
                "gra1", "gra2", "gra3", "gra4"
            )
    }
    write.csv(crops_rep, file.path("temp", "crops_rep.csv"), row.names = FALSE)
}