✅ Week 02 Lab - Solutions
2023/24 Autumn Term
library(dplyr) # for data manipulation
library(tidyr) # also for data manipulation
library(readr) # for reading data
library(lubridate) # for working with dates
library(tidyselect) # for selecting columns
Part III - Lab Solutions
# This assumes you have a folder called `data` in the same directory as this file
# and that you have downloaded the CSV file into that folder
filepath <- "data/UK-HPI-full-file-2023-06.csv"

# Read the whole CSV into a tibble; later steps select the columns they need
uk_hpi <- read_csv(filepath)
The final solution then is:
# Columns to keep from the raw file, and the regions we want to compare
selected_cols <- c("Date", "RegionName", "12m%Change")
selected_regions <- c(
  "United Kingdom", "England", "Scotland", "Wales", "Northern Ireland"
)

df <-
  uk_hpi %>%
  # all_of() makes it explicit that `selected_cols` is an external character
  # vector of column names (and errors if any of them is missing)
  select(all_of(selected_cols)) %>%
  # `12m%Change` is not a syntactically valid R name, so it needs backticks
  rename(region = RegionName, date = Date, yearly_change = `12m%Change`) %>%
  filter(region %in% selected_regions) %>%
  drop_na(yearly_change) %>%
  # Parse the day-month-year strings into proper Date objects
  mutate(date = dmy(date)) %>%
  # Most recent observations first
  arrange(desc(date))

# Preview the first 8 rows
df %>% head(8)
Explanation
- We use `dplyr::select()` together with `tidyselect::all_of()` to select the columns we want to keep.
- We use `dplyr::rename()` to rename the columns we want to keep. Note that we use backticks to refer to the `12m%Change` column because its name starts with a digit and contains a special character, so it is not a syntactically valid R name. Without the backticks, the R parser would assume we are trying to run some R code.
- We use `dplyr::filter()` to filter the rows we want to keep. We use the `%in%` operator to check whether each value in the `region` column is in the `selected_regions` vector. `%in%` is part of base R; you can read its documentation by running `?"%in%"` in the R console (the operator has to be quoted for the help lookup to work).
📋 Bonus Task
The plot:
If you are not familiar with using ggplot for plotting, the line-by-line construction of the plot below might be useful. The `+` operator is used to add layers to the plot.
library(ggplot2)

# The plot starts in June 2019
plot_df <- df %>% filter(date >= dmy("01-06-2019"))

# We must convert the region to a factor to control the order of the legend
countries <- c(
  "United Kingdom", "England", "Scotland",
  "Wales", "Northern Ireland"
)
plot_df$region <- factor(plot_df$region, levels = countries)

# Mapping the data to the aesthetics
g <- ggplot(plot_df, aes(x = date, y = yearly_change, color = region))

# Add geoms
g <- g + geom_line(linewidth = 1.5, alpha = 0.8)

# Customise the axis
g <- g +
  scale_x_date(
    name = "Date",
    date_labels = "%b %Y",
    date_breaks = "6 months",
    date_minor_breaks = "1 month",
    # No padding on the left, 10 units (days, on a date scale) on the right
    expand = expansion(add = c(0, 10))
  ) +
  scale_y_continuous(
    name = "Yearly change (%)",
    limits = c(-4, 20),
    breaks = seq(-4, 22, 2)
  )

# Customise the legend
# The names match the region values, so each region gets its own colour
color_mapping <- c(
  "United Kingdom" = "#F78E1E",
  "England" = "#A7A9AC",
  "Scotland" = "#83D2E4",
  "Wales" = "#EBE72A",
  "Northern Ireland" = "#A0CF67"
)
g <- g + scale_color_manual(name = "Region", values = color_mapping)

# Customising the theme
g <- g + theme_bw()

# Print the finished plot
g
library(ggplot2)

# The plot starts in June 2019
plot_df <- df %>% filter(date >= dmy("01-06-2019"))

# We must convert the region to a factor to control the order of the legend
countries <- c(
  "United Kingdom", "England", "Scotland",
  "Wales", "Northern Ireland"
)
plot_df$region <- factor(plot_df$region, levels = countries)

# Customise the legend
# The names match the region values, so each region gets its own colour
color_mapping <- c(
  "United Kingdom" = "#F78E1E",
  "England" = "#A7A9AC",
  "Scotland" = "#83D2E4",
  "Wales" = "#EBE72A",
  "Northern Ireland" = "#A0CF67"
)

# Mapping the data to the aesthetics, then adding every layer in one expression
g <-
  ggplot(
    plot_df,
    aes(x = date, y = yearly_change, color = region)
  ) +
  geom_line(linewidth = 1.5, alpha = 0.8) +
  scale_x_date(
    name = "Date",
    date_labels = "%b %Y",
    date_breaks = "6 months",
    date_minor_breaks = "1 month",
    # No padding on the left, 10 units (days, on a date scale) on the right
    expand = expansion(add = c(0, 10))
  ) +
  scale_y_continuous(
    name = "Yearly change (%)",
    limits = c(-4, 20),
    breaks = seq(-4, 22, 2)
  ) +
  scale_color_manual(name = "Region", values = color_mapping) +
  theme_bw()

# Print the finished plot
g