#outlier detection
#

# Load data ---------------------------------------------------------------

# raw data
raw.data <- load.fadn.raw.rds(countries = "all")
# load str crops data
fadn.str <- load.fadn.str.rds(extraction_dir = "test")
fadn.str.crops <- fadn.str$crops

fadn.str.crops %>% count(VARIABLE)
fadn.str.crops %>% count(CROP)
# filter "LEVL"

# LEVL ---------------------------------------------------------------
fadn.str.crops.levl <- fadn.str.crops %>% filter(VARIABLE == "LEVL") %>%
  mutate(ORGANIC=case_when(
    ORGANIC=="org-2" ~ "Organic",
    TRUE ~ "Conventional"))%>%
  mutate(value2 = WEIGHT*VALUE) %>%
  select(-(TF8:SIZ6),-WEIGHT, -ALTITUDE)
# table: number of obervations per crop
knitr::kable(
  caption="Number of observations per CROP",
  dcast(fadn.str.crops.levl[,.N,by=list(YEAR,CROP)],CROP~YEAR,value.var = "N")
)
# SETA is missing

# GROF --------------------------------------------------------------------
fadn.str.crops.grof <- fadn.str.crops %>% filter(VARIABLE == "GROF") %>%
  mutate(ORGANIC=case_when(
    ORGANIC=="org-2" ~ "Organic",
    TRUE ~ "Conventional"))%>%
  mutate(value2 = WEIGHT*VALUE) %>%
  select(-(TF8:SIZ6),-WEIGHT, -ALTITUDE)

knitr::kable(
  caption="Number of observations per CROP",
  dcast(fadn.str.crops.grof[,.N,by=list(YEAR,CROP)],CROP~YEAR,value.var = "N")
)


# EAAP ---------------------------------------------------------------
fadn.str.crops.eaap <- fadn.str.crops %>% filter(VARIABLE == "EAAP") %>%
  mutate(ORGANIC=case_when(
    ORGANIC=="org-2" ~ "Organic",
    TRUE ~ "Conventional"))%>%
  mutate(value2 = WEIGHT*VALUE) %>%
  select(-(TF8:SIZ6),-WEIGHT, -ALTITUDE)

knitr::kable(
  caption="Number of observations per CROP",
  dcast(fadn.str.crops.eaap[,.N,by=list(YEAR,CROP)],CROP~YEAR,value.var = "N")
)


#puls: 2013 before PULS,

#      2014 after PULS_lntls, PULS_oth, PULS_pea
# dcast(fadn.str.crops.levl[,value2,by=list(YEAR,CROP)],CROP~YEAR,value.var = "value2",fun.aggregate = sum) %>% as_tibble()%>% filter( grepl("PULS",CROP) )

# fadn.str.crops.levl  %>%
#   mutate(CROP = case_when(
#     YEAR %in% c(2014:2018) & CROP %in% c("PULS_lntls", "PULS_oth", "PULS_pea") ~ "PULS",
#     TRUE ~ CROP )) %>%
#   group_by(CROP,YEAR) %>%
#   summarise(sum_ = sum(value2)) %>% filter ( !CROP %in% c("PULS_lntls", "PULS_oth", "PULS_pea")) %>%
#   pivot_wider(names_from = YEAR, values_from = sum_)
#   filter(grepl("PULS",CROP))

# Correct PULS: json file
# 2013 before: PULS,
# 2014 after: sum(PULS_lntls, PULS_oth, PULS_pea)=PULS
# fadn.str.crops.levl <- fadn.str.crops.levl  %>%
#     mutate(CROP = case_when(
#       YEAR %in% c(2014:2018) &
#         CROP %in% c("PULS_lntls", "PULS_oth", "PULS_pea") ~ "PULS",
#       TRUE ~ CROP
#     ))
# check coherence of PULS
# fadn.str.crops.levl %>%
#     filter(grepl("PULS", CROP)) %>%
#     mutate(CROP = case_when(CROP != "PULS" ~ "PULS_new", TRUE ~ "PULS")) %>%
#     group_by(YEAR, CROP) %>%
#     summarise(sum_ = sum(value2)) %>%
#     pivot_wider(names_from = CROP, values_from = sum_) %>%
#     mutate(diff_PULS = PULS_new-PULS)


# Number of farms per CROP -----
dcast(fadn.str.crops.levl[, value2, by = list(YEAR,CROP)],
        CROP ~ YEAR,
        value.var = "value2",
        fun.aggregate = sum)

# Plot of data and outliers -----------------------------------------------

# boxplot

# outlier
is_outlier <- function (x){
  return(x < quantile(x, 0.25) - 1.5 * IQR(x) | x > quantile(x, 0.75) + 1.5 * IQR(x))
}


remove_outlier <- function(x, na.rm = TRUE, ...) {
  qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...)
  H <- 1.5 * IQR(x, na.rm = na.rm)
  y <- x
  y[x < (qnt[1] - H)] <- NA
  y[x > (qnt[2] + H)] <- NA
  y
}

# without outlier
calc_boxplot_stat <- function(x) {
  coef <- 1.5
  n <- sum(!is.na(x))
  # calculate quantiles
  stats <- quantile(x, probs = c(0.0, 0.25, 0.5, 0.75, 1.0))
  names(stats) <- c("ymin", "lower", "middle", "upper", "ymax")
  iqr <- diff(stats[c(2, 4)])
  # set whiskers
  outliers <- x < (stats[2] - coef * iqr) | x > (stats[4] + coef * iqr)
  if (any(outliers)) {
    stats[c(1, 5)] <- range(c(stats[2:4], x[!outliers]), na.rm = TRUE)
  }
  return(stats)
}

countries <- fadn.str.crops.levl$COUNTRY %>% unique()
boxplot_crop <- function() {
  for (country in countries) {
    if (file.exists(paste0(CurrentProjectDirectory,"/plots/outlier/",country,"/"))) {

      cat("The folder already exists")

    } else {

      dir.create(paste0(CurrentProjectDirectory,"/plots/outlier/",country,"/"))

    }
    data <- fadn.str.crops.levl %>% filter(COUNTRY == country)

    crops <-  data$CROP %>% unique()

    for (filter.crop in crops) {
      # with outlier
      options(digits = 2)
      p1 <- data %>% filter(CROP %in% filter.crop) %>%
        mutate(outlier = ifelse(is_outlier(value2), value2, as.numeric(NA))) %>%
        ggplot(aes(x = factor(YEAR),
                   y = value2,
                   fill = ORGANIC)) +
        geom_boxplot() +
        # facet_wrap(ORGANIC~CROP, nrow = 1, scales = "free")+
        facet_grid(ORGANIC ~ CROP, scales = "free_y") +
        geom_text(aes(label = outlier),
                  na.rm = TRUE,
                  hjust = -0.1) +
        theme_light()

      p2 <-data %>% filter(CROP %in% filter.crop) %>% ggplot(aes(x = factor(YEAR),
                                                              y = value2,
                                                              fill = ORGANIC)) +
        stat_summary(fun.data = calc_boxplot_stat, geom = "boxplot") +
        facet_grid(ORGANIC ~ CROP, scales = "free_y")


      ggsave(plot = p1 ,filename = paste0(CurrentProjectDirectory,"/plots/outlier/",country,"/",filter.crop,"_outlier.png"),
        width = 18, height = 8)
      ggsave(plot = p2 ,filename = paste0(CurrentProjectDirectory,"/plots/outlier/",country,"/",filter.crop,".png"),
             width = 18, height = 8)
    }
  }


}

boxplot_crop()