#' Collect common ids
#'
#' Takes the Fadn.raw.rds data (a data.table) or the Fadn.str.rds data (a list
#' whose `info` element holds the tables to stack) and collects the ids that
#' occur in every year found in the data.
#'
#' @param my.r.data A data object: either a data.table (any data.frame is
#'   accepted) or a list with an `info` element. The data must contain a
#'   `YEAR` column and an `ID` (or `id`) column.
#'
#' @return A data.table with just one column, named "common_id", holding the
#'   ids that are present in all years.
#' @export
#' @author Xinxin Yang
#'
#' @examples
#' collect.common.id(fadn.raw.rds)
#' ## Collects the common "id" from the raw rds data
#' ## for 2009-2012 years and country "BEL".
#' ## Returns a DT with one column named "common_id".
#'
collect.common.id <- function(my.r.data) {
  if (is.data.frame(my.r.data)) {
    # data.table (raw data) or any other data.frame: usable as it is
    my.data <- my.r.data
  } else {
    cat("Tranforming ", data.class(my.r.data), " to data table....\n")
    # call bind_rows through its namespace instead of attaching the whole
    # tidyverse from inside a function
    my.data <- dplyr::bind_rows(my.r.data$info)
  }

  # Locate the id column without renaming anything, so that a data.table
  # passed in by the caller is never modified by reference.
  col.names <- names(my.data)
  id.col <- intersect(c("ID", "id"), col.names)[1]
  if (!("YEAR" %in% col.names) || is.na(id.col)) {
    stop("The data must contain a 'YEAR' column and an 'ID' (or 'id') column.",
         call. = FALSE)
  }

  # One row per (YEAR, ID) pair: an id that shows up several times within the
  # same year must be counted only once for that year.
  pairs <- unique(data.table(YEAR = my.data[["YEAR"]],
                             ID = my.data[[id.col]]))
  # Rows without a year cannot belong to any year, so they are ignored.
  pairs <- pairs[!is.na(YEAR)]

  # check how many years are in the DT
  years <- unique(pairs$YEAR)
  print(years)
  count_year <- length(years)
  cat(count_year, " year(s) is/are selected.\n")

  # Keep the rows grouped year by year (in order of first appearance) so the
  # ids come out in the same order as a year-by-year scan would give.
  pairs <- pairs[order(match(YEAR, years))]

  # count in how many years each id occurs
  count_df <- pairs[, .(count = .N), by = ID]

  # keep the ids that occur in every year, as a one-column table
  count_df[count == count_year, .(common_id = ID)]
}
##################
# Check common ids
#
# Step 1:
# Read the FADN data (raw.rds or str.rds?).
# FADN raw data: a data.table (DT).
# FADN str data: a list; its "id" column is renamed to "ID".
#
# Step 2:
# Group by year.
#
# Step 3:
# Collect the ids that are common to the different years.
###################
{
"collab_server" : "",
"contents" : "",
"created" : 1618499302351.000,
"dirty" : false,
"encoding" : "UTF-8",
"folds" : "",
"hash" : "881151054",
"id" : "9FF2C80",
"lastKnownWriteTime" : 1618927188,
"last_content_update" : 1618927188,
"path" : "D:/public/yang/MIND_STEP/Fadn_data_.R",
"project_path" : null,
"properties" : {
"cursorPosition" : "14,40",
"scrollLine" : "15",
"tempName" : "Untitled1"
},
"relative_order" : 5,
"source_on_save" : false,
"source_window" : "",
"type" : "r_source"
}
\ No newline at end of file
library(data.table)
library(stringr)
library(tidyr)
library(hablar)
# Exploration script: take stock of the delivered FADN csv files (one file per
# country and year) and look at their size.
#dir = "S:/Oekonomie/Transfer/gocht/FADN_DATA_2021/csv/"
dir <- "D:/public/data/fadn/lieferung_20210414/csv/"
# NOTE(review): the working directory is still switched as before; the reads
# below use full paths, so they no longer depend on it.
setwd(dir)

# `pattern` is a regular expression, not a glob: match a literal ".csv" suffix
csv_files <- list.files(path = dir, pattern = "\\.csv$")
csv_list <- data.frame(csv_files)
colnames(csv_list) <- "names"

# split string into two columns at data frame based on "."
df <- data.frame(do.call("rbind",
                         strsplit(as.character(csv_list$names), ".",
                                  fixed = TRUE)))
# split the first column after its third character into country and year
df <- separate(df, X1, into = c("country", "Year"), sep = 3, remove = FALSE)

# count duplicates by country
table(df$country)
# count duplicates by year
table(df$Year)

countries <- unique(df$country)
years <- unique(df$Year)

# files available for two individual countries
df[df$country == "HRV", ]
df[df$country == "POL", ]

# for country: BEL
filepath <- paste0(dir, "BEL2009.csv")
BEL2009.bsp <- read.csv(filepath, header = TRUE)
ncol(BEL2009.bsp)
nrow(BEL2009.bsp)
colnames(BEL2009.bsp)

# get all the ID columns
id_BEL2009 <- grep("ID", colnames(BEL2009.bsp), value = TRUE)
BEL2009.bsp["ID"]

csv_files
file_list <- list()

# Print the number of columns and rows of the first (at most) ten files;
# head() does not produce NA entries when fewer than ten files exist.
for (i in head(csv_files, 10)) {
  print(i)
  x <- read.csv(paste0(dir, i))
  cat(ncol(x), nrow(x), "\n")
}
{
"collab_server" : "",
"contents" : "",
"created" : 1618502444757.000,
"dirty" : false,
"encoding" : "UTF-8",
"folds" : "",
"hash" : "586696798",
"id" : "D6E10598",
"lastKnownWriteTime" : 1618929067,
"last_content_update" : 1618929067,
"path" : "D:/public/yang/MIND_STEP/ouput_common_id.R",
"project_path" : null,
"properties" : {
"cursorPosition" : "7,1",
"scrollLine" : "8",
"source_window_id" : "",
"tempName" : "Untitled1"
},
"relative_order" : 7,
"source_on_save" : false,
"source_window" : "",
"type" : "r_source"
}
\ No newline at end of file
# Find all adjacent combinations in a vector.
#
# Returns a flat list with every run of neighbouring elements of `Data` whose
# length lies between 2 and length(Data) - 1: first all runs of length 2
# (from left to right), then all runs of length 3, and so on. The complete
# vector itself is not part of the result. Inputs with fewer than three
# elements have no such run, so an empty list is returned for them.
myFun <- function(Data) {
  n <- length(Data)
  # 2:(n - 1) would count downwards for n < 3, so stop here
  if (n < 3L) {
    return(list())
  }
  runs <- lapply(2:(n - 1L), function(width) {
    # a run of `width` elements can start at positions 1 .. n - width + 1
    lapply(seq_len(n - width + 1L) - 1L, function(offset) {
      Data[seq_len(width) + offset]
    })
  })
  unlist(runs, recursive = FALSE, use.names = FALSE)
}
# Load the fadn raw data and search the number of common ids for every
# combination of adjacent years (plus all years together).
#
# countires_list: vector of country codes to process.
#
# For each country the years are looked up in `df` (its `country` and `Year`
# columns). `df` is not a parameter: it is taken from the calling environment
# and has to exist before this function is called.
#
# Returns (invisibly) a list named by country. Each element is a data.frame
# with the columns "number_of_common_id" and "Years", one row per year
# combination.
output_common_id <- function(countires_list){
  rds.dir <- paste0(get.data.dir(), "/rds/")
  results <- list()

  for (country in countires_list) {
    cat("Country:", country, "\n")

    # make sure the country folder exists below rds.dir
    country.dir <- paste0(rds.dir, country)
    if (!dir.exists(country.dir)) {
      dir.create(country.dir)
    }

    # years available for this country
    years <- df[df$country == country, ]$Year
    if (length(years) == 0L) {
      message("No years found for country: ", country)
      next
    }

    # all runs of adjacent years, followed by the complete set of years;
    # fewer than three years leave only the complete set
    adjacent_list <- if (length(years) >= 3L) myFun(years) else list()
    adjacent_list[[length(adjacent_list) + 1L]] <- years

    # Load and count one combination at a time, so that only one data set
    # has to be kept in memory.
    Big.Num.Common.id <- list()
    for (year_items in adjacent_list) {
      name <- toString(year_items)
      data <- load.fadn.raw.rds(countries = country, years = year_items)
      print("******************************")
      common.id <- collect.common.id(data)
      Big.Num.Common.id[[name]] <- nrow(common.id)
    }

    DF <- data.frame(
      number_of_common_id = unlist(Big.Num.Common.id, use.names = FALSE),
      Years = names(Big.Num.Common.id),
      row.names = names(Big.Num.Common.id),
      stringsAsFactors = FALSE
    )
    results[[country]] <- DF

    # # write xlsx
    # write.xlsx(DF,
    # file="D:/public/yang/MIND_STEP/new_sample/DEUData_common_id.xlsx",
    # sheetName = country,
    # col.names= TRUE,
    # row.names = TRUE,
    # append = T)
  }

  invisible(results)
}
output_common_id("BEL")
# Convert fadn raw data into str data.
#
# Current_raw_str_map.file: passed on to convert.to.fadn.str.rds() as
#   `raw_str_map.file`.
# overwrite_external_json:  passed on as `force_external_raw_str_map`.
# countires_list:           vector of country codes to convert.
#
# For every country the folder <data dir>/rds/<country> is deleted first, then
# every file in <data dir>/rds whose name ends in "<country>.rds" is converted.
# Warnings and errors raised by a conversion are reported on the console and
# do not stop the remaining files from being processed.
raw2str <- function(Current_raw_str_map.file = NULL, overwrite_external_json = FALSE, countires_list){
  rds.dir <- paste0(get.data.dir(), "/rds")

  for (country in countires_list) {
    print("**********************************")
    raw_file_names <- dir(rds.dir, pattern = paste0(country, "\\.rds$"))

    # remove the str data that may be left over from an earlier run
    unlink(paste0(rds.dir, "/", country), recursive = TRUE)

    for (file in raw_file_names) {
      # the year is made of the digits in the file name
      year <- as.numeric(gsub("\\D+", "", file))
      cat("converting the str data for country: ", country, " and year: ", year, "\n")

      tryCatch(
        expr = {
          # Warnings are reported by a calling handler and then muffled, so
          # the conversion carries on after a warning instead of being cut
          # short by it.
          withCallingHandlers(
            convert.to.fadn.str.rds(fadn.country = country,
                                    fadn.year = year,
                                    raw_str_map.file = Current_raw_str_map.file,
                                    force_external_raw_str_map = overwrite_external_json,
                                    str.name = country),
            warning = function(w) {
              message('Caught an warning!')
              print(w)
              invokeRestart("muffleWarning")
            }
          )
        },
        error = function(e) {
          message("Caught an error! Please check the objects in json file using check.column() (see more in USE_CASE_4.R).")
          #cat("Wrong, can't convert the str r-data!",sep = "\n")
          print(e)
        }
      )
    }
  }
}
# NOTE(review): `countires` is not defined in the code visible here (the
# country vector built from the csv file names is called `countries`) -
# confirm which variable is meant.
raw2str(Current_raw_str_map.file ="D:/public/yang/MIND_STEP/new_sample/raw_str_maps/rewrite_2014_after.json",
countires_list = countires)
{
"cursorPosition" : "130,0",
"scrollLine" : "0"
}
\ No newline at end of file
{
"cursorPosition" : "117,0",
"scrollLine" : "110"
}
\ No newline at end of file
{
"cursorPosition" : "131,15",
"last_setup_crc32" : "",
"scrollLine" : "110",
"source_window_id" : "",
"tempName" : "Untitled1"
}
\ No newline at end of file
{
}
\ No newline at end of file
{
"cursorPosition" : "10,22",
"scrollLine" : "0"
}
\ No newline at end of file
{
"cursorPosition" : "70,48",
"scrollLine" : "53",
"source_window_id" : ""
}
\ No newline at end of file
{
"cursorPosition" : "260,2",
"scrollLine" : "245"
}
\ No newline at end of file
{
"cursorPosition" : "17,13",
"scrollLine" : "3"
}
\ No newline at end of file
{
"cursorPosition" : "45,17",
"scrollLine" : "0"
}
\ No newline at end of file
{
"cursorPosition" : "87,0",
"scrollLine" : "73",
"tempName" : "Untitled1"
}
\ No newline at end of file
{
"cursorPosition" : "17,47",
"scrollLine" : "0",
"tempName" : "Untitled1"
}
\ No newline at end of file
{
"cursorPosition" : "13,0",
"scrollLine" : "12"
}
\ No newline at end of file
{
"cursorPosition" : "7,1",
"scrollLine" : "8",
"source_window_id" : "",
"tempName" : "Untitled1"
}
\ No newline at end of file
{
"cursorPosition" : "55,0",
"scrollLine" : "32"
}
\ No newline at end of file
{
"cursorPosition" : "226,15",
"scrollLine" : "203"
}
\ No newline at end of file