I've got some data that represents two groups, with a before and after result for both groups. When I plot the data together, it looks as though the after is much tighter around the mean. However, when I plot the groups individually, that picture changes drastically. If anyone could help me understand why this is so, I would greatly appreciate it. I've pasted the R code below.
library(dplyr) library(ggplot2) my_df <- structure(list(Group = c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1), Status = c("Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "Before", "After", "After", "After", "After", "After", "After", "After", "Before", "After", "Before", "Before", "Before", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After", "After"), Result = c(2.39000010490417, 2.00999999046326, 2.26999998092651, 2.4300000667572, 4.19999980926514, 4.01999998092651, 4.26999998092651, 4.01999998092651, 4.1399998664856, 4.05000019073486, 4.30999994277954, 4.01999998092651, 4.01999998092651, 2.47000002861023, 2.25, 2.39000010490417, 4.28999996185303, 4.09000015258789, 4.1399998664856, 4, 4.23999977111816, 4.17000007629395, 4.21000003814697, 4.1399998664856, 4.57999992370605, 2.24000000953674, 2.44000005722046, 2.42000007629395, 3.83999991416931, 4.01000022888184, 3.75, 4.01000022888184, 3.78999996185303, 3.85999989509583, 3.94000005722046, 3.96000003814697, 4.17000007629395, 4, 4.32000017166138, 4.07999992370605, 2.46000003814697, 2.60999989509583, 2.23000001907349, 2.13000011444092, 4.46999979019165, 4.09000015258789, 4.1100001335144, 4.17000007629395, 3.86999988555908, 4.5, 3.9300000667572, 2.15000009536743, 2.35999989509583, 4.46999979019165, 4.48000001907349, 4.3600001335144, 4.19000005722046, 4.28000020980835, 4.82999992370605, 4.15000009536743, 4.42000007629395, 4.15000009536743, 4.19999980926514, 4.44000005722046, 4.21999979019165, 4.38000011444092, 3.94000005722046, 4.57000017166138, 2.32999992370605, 2.44000005722046, 2.09999990463257, 2.17000007629395, 2.17000007629395, 2.61999988555908, 4.09999990463257, 3.85999989509583, 4.15999984741211, 4.19000005722046, 4.09999990463257, 3.97000002861023, 4.19999980926514, 4.32999992370605, 4.07999992370605, 3.8199999332428, 4.01999998092651, 4.15999984741211, 3.9300000667572, 4.1399998664856, 3.77999997138977, 4.11999988555908, 4.53999996185303, 4.07000017166138, 2.54999995231628, 2.50999999046326, 2.4300000667572, 2.32999992370605, 3.85999989509583, 3.92000007629395, 4.3600001335144, 4.30000019073486, 4.34000015258789, 4.1399998664856, 4.25, 4.13000011444092, 4.03999996185303, 4.26999998092651, 4.32000017166138, 4.11999988555908, 4.05000019073486, 4.44000005722046, 4.1100001335144, 4.19000005722046, 4.28000020980835, 4.51999998092651, 4.07999992370605, 4.07000017166138, 4.05000019073486, 4.46000003814697, 4.05000019073486, 2.52999997138977)), class = "data.frame", row.names = c(NA, -120L ), .Names = c("Group", "Status", "Result")) my_df %>% ggplot() + geom_density(aes(Result, fill=Status), alpha=.3) my_df %>% filter(Group == 1) %>% ggplot() + geom_density(aes(Result, fill=Status), alpha=.3) my_df %>% filter(Group == 2) %>% ggplot() + geom_density(aes(Result, fill=Status), alpha=.3)
Best Answer
You're seeing the effect of R choosing different bandwidths for the grouped and ungrouped plots. A smaller bandwidth will result in greater resolution (along the horizontal axis) of peaks and valleys in the data, while a larger bandwidth will smear them out. You can see this if you set the bandwidths yourself:
library(ggplot2) library(gridExtra) library(dplyr) grid.arrange(arrangeGrob(ggplot(my_df) + geom_density(aes(Result, fill=Status), alpha=.3, bw=0.2) + facet_grid(Group ~ .) + ggtitle("bw=0.2"), ggplot(my_df) + geom_density(aes(Result, fill=Status), alpha=.3, bw=0.05) + facet_grid(Group ~ .) + ggtitle("bw=0.05"), ncol=2), arrangeGrob(ggplot(my_df) + geom_density(aes(Result, fill=Status), alpha=.3, bw=0.2) + ggtitle("bw=0.2"), ggplot(my_df) + geom_density(aes(Result, fill=Status), alpha=.3, bw=0.05) + ggtitle("bw=0.05"), ncol=2), ncol=1, heights=c(3,2))
Notice that the grouped and ungrouped plots are the same (save for scaling) when the bandwidths (bw
) are the same.
I'm not sure if there's a way to extract the chosen bandwidth from within ggplot. However, ggplot is using the density
function to generate the density plot, and density
uses bw.nrd0
to choose the bandwidth. So let's use bw.nrd0
to see what bandwidths are being chosen. In the output below, notice how R chooses narrower bandwidths when considering each group separately, as compared with the case where both groups are combined.
Each Status
separately:
sapply(split(my_df, my_df$Status), function(d) {bw.nrd0(d$Result)})
After Before 0.1224065 0.2983324
Each Group
and Status
separately:
sapply(split(my_df, interaction(my_df$Status, my_df$Group)), function(d) {bw.nrd0(d$Result)})
After.1 Before.1 After.2 Before.2 0.09229781 0.07725860 0.07847190 0.06391614
UPDATE: Regarding your comment, do you mean the "spike" in the uppermost plot in your question? I think this is a scaling effect due to the relative number of points in each group.
my_df %>% group_by(Group, Status) %>% tally %>% ungroup %>% mutate(Percent=round(n/sum(n)*100,1))
Group Status n Percent <dbl> <chr> <int> <dbl> 1 1 After 13 10.8 2 1 Before 14 11.7 3 2 After 52 43.3 4 2 Before 41 34.2
Note that Group==2
has many more observations than Group==1
. If we equalize the number of observations in each group, then this scaling effect goes away:
grid.arrange(my_df %>% ggplot() + geom_density(aes(Result, fill=Status), alpha=.3) + ggtitle("Original Data"), my_df %>% group_by(Status, Group) %>% sample_n(100, replace=TRUE) %>% ggplot() + geom_density(aes(Result, fill=Status), alpha=.3) + ggtitle("Resampled so each group has equal number of data points"), ncol=1)
You might find it easier to compare groups using another type of plot. Below are box and violin plots. The latter is a density plot, but each group's "violin" is scaled so that they all have the same area, which might make it easier to compare the shapes and locations of each distribution.
grid.arrange(ggplot(my_df, aes(x=factor(Group), y=Result, colour=Status)) + geom_boxplot(width=0.4, position=position_dodge(0.5)) + theme_bw(), ggplot(my_df, aes(x=factor(Group), y=Result, colour=Status)) + geom_violin(bw=0.07) + theme_bw(), ncol=2)
Or a histogram, which will show you where the data points are, without the model-based smoothing of the density estimate (though of course the binwidth will affect the shape of the plot):
grid.arrange(ggplot(my_df) + geom_histogram(aes(Result, fill=Status), alpha=.4, binwidth=0.1) + facet_grid(Group ~ .) + theme_bw(), ggplot(my_df) + geom_histogram(aes(Result, fill=Status), alpha=.4, , binwidth=0.1) + theme_bw(), ncol=1, heights=c(3,2))