# Outlier detection ----
# Price and yield outliers for crops and livestock
# Outliers in crop and animal production

library(hrbrthemes)
library(viridis)

# 1) replacement NA to 0 ---- 

crops_dat.long <- crops_dat %>% pivot_longer(cols = EAAP:PRICE_final, names_to = "Variable", values_to = "VALUE")

knitr::kable(
  dcast(
    (crops_dat.long %>% as.data.table() %>% 
       filter(Variable %in% c("YIELD","PRICE", "PRICE2", "PRICE_final")))[,.(Variable,YEAR,variable="ISNA",value=ifelse(is.na(VALUE),1,0))],
    Variable+YEAR~variable,fun.aggregate = sum) %>% dcast( Variable ~ YEAR ,value.var = "ISNA"),
  caption="Number of Missing Values (NA) per Variable and year"
)

crops_dat.long.NAto0 <-crops_dat.long %>%  replace(is.na(.),0)

# 


# yield -------------------------------------------------------------------
yield <- crops_dat.long.NAto0 %>% filter(Variable == "YIELD") 

average.yield <-  gdx.dat %>% filter(variable == "Yield") 
# calculate the interquartile ranges 
# outlier
is_outlier <- function (x){
  return(x < quantile(x, 0.25) - 1.5 * IQR(x) | x > quantile(x, 0.75) + 1.5 * IQR(x))
}

remove_outlier <- function(x, na.rm = TRUE, ...) {
  qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...)
  H <- 1.5 * IQR(x, na.rm = na.rm)
  y <- x
  y[x < (qnt[1] - H)] <- NA
  y[x > (qnt[2] + H)] <- NA
  y
}


calc_boxplot_stat <- function(x) {
  coef <- 1.5
  n <- sum(!is.na(x))
  # calculate quantiles
  stats <- quantile(x, probs = c(0.0, 0.25, 0.5, 0.75, 1.0))
  names(stats) <- c("ymin", "lower", "middle", "upper", "ymax")
  iqr <- diff(stats[c(2, 4)])
  # set whiskers
  outliers <- x < (stats[2] - coef * iqr) | x > (stats[4] + coef * iqr)
  if (any(outliers)) {
    stats[c(1, 5)] <- range(c(stats[2:4], x[!outliers]), na.rm = TRUE)
  }
  return(stats)
}

#'1: IQR outlier detection by organic/non-org, year ,country and crop
#'2: boxplot and geom_violin together

org.crops <- crops_dat.long.NAto0 %>%  
  mutate(ORGANIC=case_when(
    ORGANIC=="org-2" ~ "Organic",
    TRUE ~ "Conventional"))%>%
  select(-(TF8:SIZ6),-WEIGHT, -ALTITUDE)



yield.dir = paste0(CurrentProjectDirectory,"/plots/outlier/yield")
dir.create(yield.dir)
all.countires <- org.crops$COUNTRY %>% unique()
# *Plotting ----
# plotting the data and visually inspecting for any data points that fall outside the normal distribution or pattern

for (country in all.countires ){
  dir.create(paste0(yield.dir,"/", country))
  dat <- org.crops %>% filter(COUNTRY == country &
                                Variable == "YIELD")
  crops = dat$CROP %>% unique()
  for (crop_ele in crops) {
    # original data 
    y_p <- dat %>% filter(CROP == crop_ele) %>%
      ggplot(aes(x = factor(YEAR), y = VALUE, fill = ORGANIC)) +
      geom_violin(width = 1.2) +
      geom_boxplot(width = 0.1,
                   color = "white",
                   alpha = 0.6) +
      facet_wrap(. ~ ORGANIC, scales = "free_y") +
      scale_fill_viridis(discrete = TRUE) +
      theme_ipsum() +
      theme(legend.position = "none",
            plot.title = element_text(size = 11)) +
      ggtitle(crop_ele) +
      xlab("Year") +
      ylab("Yield")+ 
      theme(axis.text.x = element_text(
        angle = 90,
        vjust = 0.5,
        hjust = 1
      ))
    
    ggsave(plot = y_p ,filename = paste0(yield.dir,"/",country,"/",crop_ele, "_yield.png"),
           width = 18, height = 8)
    
    
    # remove outlier plots
    
    y_p1 <- dat %>% filter(CROP == crop_ele) %>%
      ggplot(aes(x = factor(YEAR), y = VALUE, fill = ORGANIC)) +
      stat_summary(fun.data = calc_boxplot_stat, geom = "boxplot") +
      geom_violin(width = 1.2) +
      geom_boxplot(width = 0.1,
                   color = "white",
                   alpha = 0.6) +
      facet_wrap( .~ORGANIC, scales = "free") +
      scale_fill_viridis(discrete = TRUE) +
      theme_ipsum() +
      theme(legend.position = "none",
            plot.title = element_text(size = 11)) +
      ggtitle("Yield without outliers of ", crop_ele) +
      xlab("Year") +
      ylab("Yield")+ 
      theme(axis.text.x = element_text(
        angle = 90,
        vjust = 0.5,
        hjust = 1
      ))
    
    ggsave(plot = y_p1 ,filename = paste0(yield.dir,"/",country,"/",crop_ele, "_yield_remove_outlier.png"),
           width = 18, height = 8)
  }
}



# * yield based on IQR (interquartile range)  ---------------------------------------------------





yield.iqr <- yield  %>% select(-c(TF8:SIZ6, NUTS1:NUTS3, ALTITUDE)) %>%
  mutate(outlier = ifelse(is_outlier(VALUE), TRUE, FALSE), 
         outlier_value = ifelse(is_outlier(VALUE), VALUE, as.numeric(NA)) )

# plot 
#Options
options(scipen=100, digits=2)

# p1 <- yield.iqr %>% filter(COUNTRY %in% "DEU" & CROP %in% c("APPL","FLOW")) %>%
#   ggplot(aes(x = factor(YEAR),y = VALUE, group = 1)) +
#   geom_boxplot()+
#   # geom_point(aes(x=factor(YEAR),y=outlier_value),color="red",shape=3,size=.5) +
#   facet_wrap(.~CROP, nrow = 5, scales = "free")+
#   geom_text(aes(label = outlier_value),color ="red", 
#             na.rm = TRUE,
#             hjust = -0.1) +
#   theme_light(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# print(p1)


# 
# yield.iqr %>% filter(COUNTRY %in% "DEU" )%>%
#   ggplot(aes(x = factor(YEAR),
#              y = VALUE)) +
#   stat_summary(fun.data = calc_boxplot_stat, geom = "boxplot") +
#   facet_wrap(. ~ CROP, scales = "free_y")+
#   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))




# * average yield based on IQR ----------------------------------------------


average.yield$REG_TYPE %>% unique()