Newer
Older
Xinxin Yang
committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#' Load coco tables and dimdefs from the capri xml directory
#'
#' Reads `coco_tables.xml` and `dimdefs_new.xml` from `xml.dir`, splits the
#' dimdef entries by their `.id` value (product, dim5, region, selgroup,
#' activity) and writes every resulting table into a `data` folder below the
#' current working directory (the folder is created when missing).
#'
#' NOTE(review): the files are written with `save()`, i.e. in RData format,
#' although they carry an `.rds` extension. They therefore have to be read
#' back with `load()`, not `readRDS()`. Kept as is so existing readers keep
#' working -- confirm before switching to `saveRDS()`.
#'
#' @param xml.dir Path of the directory that holds `coco_tables.xml` and
#'   `dimdefs_new.xml`.
#' @return Called for its side effects (files written to `data/`); the status
#'   message is printed and returned invisibly.
#' @export
xml_data <- function(xml.dir) {
  # coco
  coco.xml <- file.path(xml.dir, "coco_tables.xml")
  coco_tables <- create_df_from_xml(coco.xml, "//table") %>% as_tibble()
  # Shorten the region label in place.
  coco_tables$regionSel <- gsub(
    "Regions and Countries", "Regions and C", coco_tables$regionSel,
    ignore.case = TRUE
  )

  # dimdef: dimdef_new
  dimdef.xml <- file.path(xml.dir, "dimdefs_new.xml")
  dimdef <- ldply(xmlToList(dimdef.xml), data.frame)
  # Convert the id column first, then the whole frame: piping only the factor
  # into as_tibble() would nest a one-column tibble inside `.id`.
  dimdef$.id <- factor(dimdef$.id)
  dimdef <- as_tibble(dimdef)

  dimdef_product <- dimdef %>% filter(.id == "product")
  dimdef_dim5 <- dimdef %>% filter(.id == "dim5")
  dimdef_region <- dimdef %>% filter(.id == "region")
  dimdef_selgroup <- dimdef %>% filter(.id == "selgroup")
  dimdef_activity <- dimdef %>% filter(.id == "activity")
  # Alternative extraction per element type, kept for reference:
  # dimdef_product <- create_df_from_xml(dimdef.xml, "//product") %>% as_tibble()
  # (likewise for "//dim5", "//region", "//selgroup", "//activity")

  if (!dir.exists("data")) {
    dir.create("data")
  }

  # save() stores each object under its variable name, so the names are kept
  # explicit rather than looped over.
  save(coco_tables, file = "data/coco_tables.rds")
  save(dimdef_product, file = "data/dimdef_product.rds")
  save(dimdef_dim5, file = "data/dimdef_dim5.rds")
  save(dimdef_region, file = "data/dimdef_region.rds")
  save(dimdef_selgroup, file = "data/dimdef_selgroup.rds")
  save(dimdef_activity, file = "data/dimdef_activity.rds")

  print(paste0("extracted xml data is saved in ", getwd(), "/data"))
}
#' Convert the nodes matched by an XPath expression into a data frame
#'
#' Every node matched by `char` becomes one row; the flattened (unlisted)
#' content of the node supplies the columns. Nodes with differing sets of
#' fields are combined by `plyr::ldply()`, which fills missing columns with
#' `NA`.
#'
#' @param xml Path of the xml file (coerced with `as.character()`).
#' @param char XPath expression selecting the nodes to convert, e.g.
#'   `"//table"`.
#' @return A data frame with one row per matched node.
create_df_from_xml <- function(xml, char) {
  # Attaching is kept from the original implementation: callers may rely on
  # these packages being on the search path afterwards.
  library(xml2)
  library(plyr)
  library(dplyr)

  doc <- read_xml(as.character(xml))
  nodes <- xml_find_all(doc, char)

  # One flattened, named character vector per node.
  flat <- lapply(as_list(nodes), unlist)

  # Turn each vector into a 1 x k matrix so data.frame() yields a single row.
  # lapply() (not sapply()) is required here: sapply() would simplify
  # equal-length results into one matrix, which ldply() then splits cell by
  # cell instead of node by node.
  rows <- lapply(flat, function(x) rbind(x))

  ldply(rows, data.frame)
}