✅ Week 02 Lab - Solutions
2023/24 Autumn Term
This solution file follows the format of the Quarto (`.qmd`) file
you had to fill in during the lab session. If you want to render the document yourself and play with the code, you can download the `.qmd`
version of this solution file by clicking on the button below:
📋 Lab Tasks
⚙️ Setup
library(dplyr) # for data manipulation
library(tidyr) # for data reshaping
library(readr) # for reading data
library(lubridate) # for working with dates
library(tidyselect) # for selecting columns
🛠 Part I: Data manipulation with dplyr
The final solution then is:
# Columns to keep from the raw data, and the regions (countries) to compare.
selected_cols <- c("Date", "RegionName", "12m%Change")
selected_regions <- c(
  "United Kingdom", "England", "Scotland", "Wales", "Northern Ireland"
)

# Build the tidy data frame used in the rest of the lab.
# NOTE(review): `uk_hpi` is not created in this section; presumably it is the
# raw UK House Price Index data read in earlier -- confirm.
df <-
  uk_hpi %>%
  select(all_of(selected_cols)) %>%
  # `12m%Change` is not a syntactic R name, so it must be wrapped in backticks
  rename(region = RegionName, date = Date, yearly_change = `12m%Change`) %>%
  filter(region %in% selected_regions) %>%
  # Remove rows where the yearly change is missing
  drop_na(yearly_change) %>%
  # Parse the day-month-year strings into Date objects
  mutate(date = dmy(date)) %>%
  # Most recent observations first
  arrange(desc(date))

# Preview the first 8 rows
df %>% head(8)
Explanation
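- We use `dplyr::select()` together with `tidyselect::all_of()` to select the columns we want to keep.
- We use `dplyr::rename()` to rename the columns we want to keep. Note that we use backticks around the `12m%Change` column because its name starts with a digit and contains a special character. Without the backticks, the R interpreter would try to evaluate it as R code instead of treating it as a column name.
- We use `dplyr::filter()` to filter the rows we want to keep. We use the `%in%` operator to check whether each value of the `region` column is in the `selected_regions` vector. `%in%` is part of base R; you can read its documentation by running ?`%in%` (note the backticks) in the R console.
- We use `tidyr::drop_na()` to remove the rows where `yearly_change` is missing.
- We use `dplyr::mutate()` together with `lubridate::dmy()` to convert the `date` column from day-month-year text into proper dates.
- We use `dplyr::arrange()` with `desc()` to sort the rows from the most recent date to the oldest.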
📋 Part II: Data visualisation with ggplot
Step-by-step
If you are not familiar with using ggplot for plotting, the line-by-line construction of the plot below might be useful. The `+` operator is used to add layers to the plot.
library(ggplot2)

# Step-by-step construction of the yearly-change line plot: the plot object
# `g` is built up one layer at a time with `+`.
# Relies on `df` (columns `date`, `region`, `yearly_change`) from Part I.

# The plot starts in June 2019
plot_df <- df %>% filter(date >= dmy("01-06-2019"))

# We must convert the region to a factor to control the order of the legend
countries <- c(
  "United Kingdom", "England", "Scotland",
  "Wales", "Northern Ireland"
)
plot_df$region <- factor(plot_df$region, levels = countries)

# Mapping the data to the aesthetics
g <- ggplot(plot_df, aes(x = date, y = yearly_change, color = region))

# Add geoms
g <- g + geom_line(linewidth = 1.5, alpha = 0.8)

# Customise the axis
g <- g +
  scale_x_date(
    name = "Date",
    date_labels = "%b %Y",
    date_breaks = "6 months",
    date_minor_breaks = "1 month",
    # No padding on the left of the x axis, 10 units (days) on the right
    expand = expansion(add = c(0, 10))
  ) +
  scale_y_continuous(
    name = "Yearly change (%)",
    limits = c(-4, 20),
    breaks = seq(-4, 22, 2)
  )

# Customise the legend
color_mapping <- c(
  "United Kingdom" = "#F78E1E",
  "England" = "#A7A9AC",
  "Scotland" = "#83D2E4",
  "Wales" = "#EBE72A",
  "Northern Ireland" = "#A0CF67"
)
g <- g + scale_color_manual(name = "Region", values = color_mapping)

# Customising the theme
g <- g + theme_bw()

# Display the finished plot
g
Single ggplot call
library(ggplot2)

# Same plot as the step-by-step version, built in a single ggplot() call.
# Relies on `df` (columns `date`, `region`, `yearly_change`) from Part I.

# The plot starts in June 2019
plot_df <- df %>% filter(date >= dmy("01-06-2019"))

# We must convert the region to a factor to control the order of the legend
countries <- c(
  "United Kingdom", "England", "Scotland",
  "Wales", "Northern Ireland"
)
plot_df$region <- factor(plot_df$region, levels = countries)

# Customise the legend
color_mapping <- c(
  "United Kingdom" = "#F78E1E",
  "England" = "#A7A9AC",
  "Scotland" = "#83D2E4",
  "Wales" = "#EBE72A",
  "Northern Ireland" = "#A0CF67"
)

# Mapping the data to the aesthetics, then adding geoms, scales and theme
g <-
  ggplot(
    plot_df,
    aes(x = date, y = yearly_change, color = region)
  ) +
  geom_line(linewidth = 1.5, alpha = 0.8) +
  scale_x_date(
    name = "Date",
    date_labels = "%b %Y",
    date_breaks = "6 months",
    date_minor_breaks = "1 month",
    # No padding on the left of the x axis, 10 units (days) on the right
    expand = expansion(add = c(0, 10))
  ) +
  scale_y_continuous(
    name = "Yearly change (%)",
    limits = c(-4, 20),
    breaks = seq(-4, 22, 2)
  ) +
  scale_color_manual(name = "Region", values = color_mapping) +
  theme_bw()

# Display the finished plot
g
🏡 Bonus Task
# Compare average house prices in Inner and Outer London with the UK average.
# NOTE(review): `uk_hpi` is not created in this section; presumably it is the
# raw data with `Date`, `RegionName` and `AveragePrice` columns -- confirm.
price_plot <- uk_hpi %>%
  # Filter for regions
  filter(
    RegionName %in% c("Outer London", "Inner London", "United Kingdom")
  ) %>%
  select(Date, RegionName, AveragePrice) %>%
  # Pivot dataframe from tall to wide.
  # With a single `values_from` column, pivot_wider() names the new columns
  # after the RegionName values only, so `names_prefix` is needed to produce
  # the `AveragePrice_<region>` names used below.
  pivot_wider(
    names_from = RegionName,
    values_from = AveragePrice,
    names_prefix = "AveragePrice_"
  ) %>%
  # Drop NAs (dates for which any of the three regions has no price)
  drop_na() %>%
  # Change date format (day/month/year text to Date)
  mutate(Date = as.Date(Date, format = "%d/%m/%Y")) %>%
  # Sort by date
  arrange(desc(Date)) %>%
  # Create new variables: price relative to the UK average (1 = same as UK)
  mutate(
    AveragePriceComparisonInnerLondon =
      `AveragePrice_Inner London` / `AveragePrice_United Kingdom`,
    AveragePriceComparisonOuterLondon =
      `AveragePrice_Outer London` / `AveragePrice_United Kingdom`
  ) %>%
  # Select the new variables
  select(
    Date,
    AveragePriceComparisonInnerLondon,
    AveragePriceComparisonOuterLondon,
    `AveragePrice_United Kingdom`
  )

ggplot(data = price_plot, aes(x = Date)) +
  geom_line(
    aes(y = AveragePriceComparisonInnerLondon, color = "Inner London")
  ) +
  geom_line(
    aes(y = AveragePriceComparisonOuterLondon, color = "Outer London")
  ) +
  # Add title and axis titles
  labs(
    title = "Average difference in house price between regions of London and the United Kingdom",
    y = "Relative house price",
    x = "Date",
    color = "Region"
  ) +
  # Change theme
  theme_minimal() +
  # Change y limits
  ylim(0, 3) +
  # Add a horizontal line at y=1 (`linewidth`, not the deprecated `size`,
  # matching the geom_line() calls elsewhere in this file)
  geom_hline(yintercept = 1, linetype = "dashed", linewidth = 0.5) +
  # Change colour scheme for the lines
  scale_color_brewer(palette = "Dark2") +
  # Format the x axis labels
  scale_x_date(date_labels = "%Y-%m") +
  # Change the title size
  theme(
    plot.title = element_text(size = 12)
  )