This post was adapted from a section in The Jigsaw Puzzle Pieces - Creating the Graphs with ggplot2, Part II of a series of posts on building a Shiny app. There we introduced several ways of grouping in plotting.
As usual, before we begin, let’s load all the packages we will need.
library(dplyr)
library(stringr)
library(data.table)
library(ggplot2)
library(gridExtra)
library(grid)
## setwd()
## Restore the saved `survey` object into the workspace, then preview its
## first six rows.
load(file = "~/Desktop/r sample files/survey")
head(x = survey, n = 6L)
## # A tibble: 6 x 17
## status country major Q1.1 Q1.2 Q1.3 top_reason place_options
## <fct> <fct> <fct> <chr> <chr> <chr> <chr> <chr>
## 1 Freshm… U.S. Undef… Never Never Never Find a quiet … quiet (occasio…
## 2 Freshm… China Undef… Occas… Often Often Find a quiet … crowded,focused
## 3 Freshm… U.S. Undef… Never Occa… Often Meet up with … (close to) sil…
## 4 Freshm… China Undef… Occas… Occa… Occa… Meet up with … focused,(close…
## 5 Freshm… U.S. Undef… Never Occa… Occa… Find a quiet … (close to) sil…
## 6 Freshm… China Undef… Occas… Often Often Find a quiet … (close to) sil…
## # ... with 9 more variables: space_lib <chr>, rank_crowded <dbl>,
## # rank_modpop <dbl>, rank_noisy <dbl>, rank_quiet <dbl>,
## # rank_silent <dbl>, rank_relaxed <dbl>, rank_focused <dbl>,
## # workshops <chr>
We’ll create a list of color palettes. This will make working with colors easier later when plotting.
## Colour families used by the plots; shades are picked by position,
## e.g. palette[[1]][4] is the fourth purple.
palette <- list(
  purple    = c("#351F39", "#351C4D", "#6c1f55", "#765285", "#8a6899"),
  turquoise = c("#709FB0", "#849974", "#A0C1B8"),
  golden    = c("#D1A827", "#f3da4c")
)
Let’s say we want to visualize the distribution of the top reasons for visiting the library (top_reason) by group. We will use this subsample as a demo to show ways of visually grouping data.
Previously we created a data frame, dtset, summarizing the top reasons for visiting the library by country. That took quite a bit of reshaping work (copied and pasted below).
## Answer options of `top_reason`. Each entry is used as a regular expression
## below, which is why the parentheses in the database option are escaped.
Reason <- c(
  "Work on a class assignment/paper",
  "Watch video or listen audio",
  "Use specialized databases \\(e.g. Bloomberg, Wind\\)",
  "Use a library computer",
  "Use a group study room",
  "Print, photocopy, scan",
  "Other",
  "Meet up with friends",
  "Hang out between classes",
  "Get readings from Course Reserve",
  "Get help from a librarian",
  "Find a quiet place to study",
  "Borrow books and materials",
  "Attend a library workshop"
)

## One row per respondent: the country first, then one column per reason that
## holds the matched text (NA when the reason does not occur in the answer).
r <- data.frame(survey$country)
for (m in seq_along(Reason)) {
  r[, Reason[m]] <- str_extract(survey$top_reason, Reason[m])
}

## Summary table keyed by the readable label. The regex escapes are stripped
## from the plain strings, so the label is fixed whether data.frame() produces
## factors or character vectors (editing levels() has no effect on characters).
dtset <- data.frame(Reason = gsub("\\", "", Reason, fixed = TRUE))

g <- c("China", "U.S.", "Other")
for (n in seq_along(g)) {
  ## Pick respondents by the country column only; which() drops NA countries.
  in_group <- which(r[[1]] == g[n])
  ## Count the non-missing matches per reason column for this group.
  dtset[, g[n]] <- vapply(
    r[in_group, Reason, drop = FALSE],
    function(x) sum(!is.na(x)),
    integer(1),
    USE.NAMES = FALSE
  )
}

## Total across the three groups, then sort from most to least frequent.
dtset$Total <- rowSums(dtset[, g], na.rm = TRUE)
dtset <- dtset[order(dtset$Total, decreasing = TRUE), ]
dtset
## Reason China U.S. Other Total
## 12 Find a quiet place to study 123 47 50 220
## 6 Print, photocopy, scan 104 45 52 201
## 1 Work on a class assignment/paper 89 46 31 166
## 13 Borrow books and materials 63 15 28 106
## 10 Get readings from Course Reserve 30 6 8 44
## 4 Use a library computer 11 20 10 41
## 5 Use a group study room 32 4 5 41
## 8 Meet up with friends 23 4 9 36
## 9 Hang out between classes 16 6 5 27
## 11 Get help from a librarian 13 2 7 22
## 3 Use specialized databases (e.g. Bloomberg, Wind) 12 3 2 17
## 14 Attend a library workshop 9 5 3 17
## 2 Watch video or listen audio 7 2 2 11
## 7 Other 2 2 1 5
We will use the data for all demos below.
Let’s say we want to plot the distribution of top visits and make the contrast more visible among groups. One way to achieve the goal is to color the levels in groups. Specifically, we decide to group the frequencies as “high”, “medium” and “low”.
We first code the levels.
## Readable reason labels, listed in the same order as the level codes below.
Reason2 <- c(
  "Work on a class assignment/paper",
  "Watch video or listen audio",
  "Use specialized databases (e.g. Bloomberg, Wind)",
  "Use a library computer",
  "Use a group study room",
  "Print, photocopy, scan",
  "Other",
  "Meet up with friends",
  "Hang out between classes",
  "Get readings from Course Reserve",
  "Get help from a librarian",
  "Find a quiet place to study",
  "Borrow books and materials",
  "Attend a library workshop"
)
## label the frequency level
## the standard may be a bit arbitrary
l <- data.frame(
  Reason = Reason2,
  Level = c(
    "High", "Low", "Low", "Medium", "Medium", "High", "Low",
    "Medium", "Low", "Medium", "Low", "High", "High", "Low"
  )
)
## Attach the level to every row of the summary table, matching on the label.
dtset <- left_join(dtset, l, by = "Reason")
The rest is standard plotting.
## Horizontal bar chart of the total count per reason, with bars coloured by
## frequency level (legend ordered High, Medium, Low).
ggplot(
  dtset,
  aes(
    x = reorder(Reason, Total),
    y = Total,
    fill = factor(Level, levels = c("High", "Medium", "Low"))
  )
) +
  geom_col(alpha = 0.75) +
  ## \n in "Level of\nFrequency" breaks the line
  scale_fill_manual(
    name = "Level of\nFrequency",
    values = c(palette$purple[4], palette$turquoise[1], palette$golden[1])
  ) +
  coord_flip() +
  theme(
    plot.margin = unit(c(0, 0, 1, 0), "cm"),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 12),
    axis.title.x = element_text(size = 12, margin = margin(t = 15, r = 0, b = 0, l = 0)),
    axis.title.y = element_blank(),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12, margin = margin(t = 0, r = 3, b = 0, l = 0)),
    axis.ticks.x = element_line(size = 0)
  )
Now let’s say we want to plot the distribution of top visits by country. This can be done with “facets” - we will get panels within one plot for each subgroup.
To support the graphing needs, we need to reshape the dtset data frame into long format.
## Reshape the wide summary into long format: one row per (reason, group)
## pair, with the group name in `variable` and the count in `value`.
## This is done with explicit id / measure columns in base R instead of a bare
## melt(): data.table's melt() only has a method for data.tables, so calling
## it on a plain data frame depends on a hand-off to reshape2, and the id
## columns were being guessed rather than stated.
id_cols <- names(dtset)[!vapply(dtset, is.numeric, logical(1))]
count_cols <- setdiff(names(dtset), id_cols)
dtset2 <- data.frame(
  ## Repeat the id columns once per measure column, keeping the row order.
  dtset[rep(seq_len(nrow(dtset)), times = length(count_cols)), id_cols, drop = FALSE],
  ## Factor levels follow the column order so facets keep that order.
  variable = factor(rep(count_cols, each = nrow(dtset)), levels = count_cols),
  value = unlist(dtset[count_cols], use.names = FALSE),
  row.names = NULL
)
dtset2
## Reason Level variable value
## 1 Find a quiet place to study High China 123
## 2 Print, photocopy, scan High China 104
## 3 Work on a class assignment/paper High China 89
## 4 Borrow books and materials High China 63
## 5 Get readings from Course Reserve Medium China 30
## 6 Use a library computer Medium China 11
## 7 Use a group study room Medium China 32
## 8 Meet up with friends Medium China 23
## 9 Hang out between classes Low China 16
## 10 Get help from a librarian Low China 13
## 11 Use specialized databases (e.g. Bloomberg, Wind) Low China 12
## 12 Attend a library workshop Low China 9
## 13 Watch video or listen audio Low China 7
## 14 Other Low China 2
## 15 Find a quiet place to study High U.S. 47
## 16 Print, photocopy, scan High U.S. 45
## 17 Work on a class assignment/paper High U.S. 46
## 18 Borrow books and materials High U.S. 15
## 19 Get readings from Course Reserve Medium U.S. 6
## 20 Use a library computer Medium U.S. 20
## 21 Use a group study room Medium U.S. 4
## 22 Meet up with friends Medium U.S. 4
## 23 Hang out between classes Low U.S. 6
## 24 Get help from a librarian Low U.S. 2
## 25 Use specialized databases (e.g. Bloomberg, Wind) Low U.S. 3
## 26 Attend a library workshop Low U.S. 5
## 27 Watch video or listen audio Low U.S. 2
## 28 Other Low U.S. 2
## 29 Find a quiet place to study High Other 50
## 30 Print, photocopy, scan High Other 52
## 31 Work on a class assignment/paper High Other 31
## 32 Borrow books and materials High Other 28
## 33 Get readings from Course Reserve Medium Other 8
## 34 Use a library computer Medium Other 10
## 35 Use a group study room Medium Other 5
## 36 Meet up with friends Medium Other 9
## 37 Hang out between classes Low Other 5
## 38 Get help from a librarian Low Other 7
## 39 Use specialized databases (e.g. Bloomberg, Wind) Low Other 2
## 40 Attend a library workshop Low Other 3
## 41 Watch video or listen audio Low Other 2
## 42 Other Low Other 1
## 43 Find a quiet place to study High Total 220
## 44 Print, photocopy, scan High Total 201
## 45 Work on a class assignment/paper High Total 166
## 46 Borrow books and materials High Total 106
## 47 Get readings from Course Reserve Medium Total 44
## 48 Use a library computer Medium Total 41
## 49 Use a group study room Medium Total 41
## 50 Meet up with friends Medium Total 36
## 51 Hang out between classes Low Total 27
## 52 Get help from a librarian Low Total 22
## 53 Use specialized databases (e.g. Bloomberg, Wind) Low Total 17
## 54 Attend a library workshop Low Total 17
## 55 Watch video or listen audio Low Total 11
## 56 Other Low Total 5
The facet_wrap() function allows us to plot by group.
## Lollipop chart of the count per reason, drawn as one panel per group.
ggplot(dtset2, aes(x = value, y = reorder(Reason, value))) +
  geom_segment(
    aes(
      x = 0,
      xend = value,
      y = reorder(Reason, value),
      yend = reorder(Reason, value)
    ),
    color = "grey50",
    size = 0.3
  ) +
  geom_point(size = 2, color = palette$purple[4]) +
  ## produces the facets in two rows
  facet_wrap(~variable, nrow = 2) +
  theme(
    strip.text = element_text(size = 12),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10, margin = margin(t = 0, r = 5, b = 0, l = 0)),
    axis.ticks.x = element_line(size = 0)
  )
Another scenario of grouping plots is “combining graphs”. Facets work on subgroups of the same data, whereas combining puts plots of different topics and types into one graph rather than plotting subgroups on one graph.
For instance, we can combine “top reason of visiting library” and “distribution of survey participants”; but we will use faceting for plotting “top reason of visiting library (U.S.)” and “top reason of visiting library (other international students)”.
Below we will create a combined plot on student submissions of “My favorite place to study in Library” with user preference on study environment.
## Mean ranking of each study environment per student status, rounded to two
## decimals. All five means come from one grouped summarise, replacing five
## separate passes stitched together with implicit-key joins; the columns are
## named with their display labels directly.
rank_status <- survey %>%
  group_by(status) %>%
  summarise(
    `crowded` = round(mean(rank_crowded, na.rm = TRUE), 2),
    `moderately populated` = round(mean(rank_modpop, na.rm = TRUE), 2),
    `noisy` = round(mean(rank_noisy, na.rm = TRUE), 2),
    `quiet (occasional whispers)` = round(mean(rank_quiet, na.rm = TRUE), 2),
    `(close to) silent` = round(mean(rank_silent, na.rm = TRUE), 2)
  ) %>%
  ## as.data.frame() keeps the labels intact; data.frame() would rewrite the
  ## names that contain spaces or parentheses.
  as.data.frame()

## Long format: one row per (status, environment) pair. Built explicitly in
## base R because melt() from data.table has no method for plain data frames.
score_cols <- setdiff(names(rank_status), "status")
rank_status <- data.frame(
  status = rep(rank_status$status, times = length(score_cols)),
  ## Levels follow the column order so the x axis keeps that order.
  variable = factor(rep(score_cols, each = nrow(rank_status)), levels = score_cols),
  value = unlist(rank_status[score_cols], use.names = FALSE)
)
## Heat map of the average ranking per status (rows) and study environment
## (columns); each tile is labelled with its value and the legend is hidden.
c1 <-
  ggplot(rank_status, aes(x = variable, y = status)) +
  geom_tile(aes(fill = value), alpha = 0.95) +
  geom_text(aes(label = value), size = 2.5, alpha = 0.9) +
  scale_fill_gradient(low = palette$turquoise[1], high = "white") +
  ## Reverse the status levels along the y axis.
  scale_y_discrete(limits = rev(levels(rank_status$status))) +
  labs(subtitle = " * Average scores of ranking (1 to 6). Note: limited cases in each group.") +
  theme(
    plot.margin = unit(c(1, 1, 1, 1), "cm"),
    plot.subtitle = element_text(size = 10, margin = margin(t = 0, r = 0, b = 15, l = 0)),
    legend.position = "none",
    legend.title = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_text(size = 12, angle = 90, hjust = 1),
    axis.text.y = element_text(size = 12, margin = margin(t = 0, r = 3, b = 0, l = 0)),
    axis.ticks.x = element_line(size = 0)
  )
c1
## Split the comma-separated `space_lib` answers into single entries, trim the
## surrounding whitespace, drop the literal "NULL" entries, then tally each
## entry from most to least frequent.
lib <- survey$space_lib %>%
  strsplit(",") %>%
  unlist() %>%
  str_trim(side = "both")
lib <- lib[!(lib %in% "NULL")]
lib <- data.frame(lib) %>%
  count(lib) %>%
  data.frame() %>%
  arrange(desc(n))
## Lollipop chart of how often each library space was mentioned; the count is
## printed in white inside each point and the axes are flipped.
c2 <-
  ggplot(lib, aes(x = n, y = reorder(lib, -n), label = n)) +
  geom_segment(
    aes(
      x = 0,
      xend = n,
      y = reorder(lib, -n),
      yend = reorder(lib, -n)
    ),
    color = "grey50",
    size = 0.5
  ) +
  geom_point(size = 5) +
  geom_text(size = 2, color = "white") +
  coord_flip() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_text(size = 12, angle = 90, hjust = 1),
    axis.text.y = element_text(size = 12),
    axis.ticks.x = element_line(size = 0)
  )
c2
## Put the heat map and the lollipop chart side by side under a shared title.
grid.arrange(
  c1, c2,
  nrow = 1,
  top = textGrob(label = "Study Space Preference", gp = gpar(fontsize = 10))
)
As you may say, the combination is a bit arbitrary and the combined plot is not so beautiful. But I want to show you the rationale of using “combining graphs” rather than “facets”.