✅ Week 02 Lab - Solutions

2023/24 Autumn Term

Author
library(dplyr)       # for data manipulation
library(tidyr)       # also for data manipulation
library(readr)       # for reading data
library(lubridate)   # for working with dates
library(tidyselect)  # for selecting columns

Part III - Lab Solutions

# Where the downloaded CSV lives: this expects a folder called `data` in the
# same directory as this file, with the UK HPI CSV already saved inside it
filepath <- "data/UK-HPI-full-file-2023-06.csv"

# Load the full file into a data frame
uk_hpi <- readr::read_csv(file = filepath)

The final solution then is:

# Columns to keep from the raw file, and the regions we want to compare
selected_cols    <- c("Date", "RegionName", "12m%Change")
selected_regions <- c(
  "United Kingdom", "England", "Scotland", "Wales", "Northern Ireland"
)

df <- uk_hpi %>%
  # Keep only the columns listed in `selected_cols`
  select(all_of(selected_cols)) %>%
  # Switch to shorter, syntactically valid column names
  rename(
    date = Date,
    region = RegionName,
    yearly_change = `12m%Change`
  ) %>%
  # Keep only the rows belonging to the selected regions
  filter(region %in% selected_regions) %>%
  # Discard rows where the yearly change is missing
  drop_na(yearly_change) %>%
  # Convert to dates; dmy() reads the values in day-month-year order
  mutate(date = dmy(date)) %>%
  # Most recent observations first
  arrange(desc(date))

# Preview the first 8 rows
head(df, n = 8)

Explanation

  • We use dplyr::select() together with tidyselect::all_of() to select the columns we want to keep.
  • We use dplyr::rename() to give the columns we kept shorter names. Note that we wrap the 12m%Change column in backticks because it is not a syntactically valid R name: it starts with a digit and contains the special character %. Without the backticks, the R parser would try to read it as R code rather than as a column name, and fail.
  • We use dplyr::filter() to filter the rows we want to keep. We use the %in% operator to check whether each value in the region column is in the selected_regions vector. %in% is part of base R; you can read its documentation by running ?`%in%` in the R console (the backticks are required because %in% is an operator, and ?%in% on its own is a syntax error).

📋 Bonus Task

The plot:

If you are not familiar with using ggplot for plotting, the line by line construction of the plot below might be useful. The + operator is used to add layers to the plot.

library(ggplot2)

# The plot starts in June 2019
plot_df <- df %>% filter(date >= dmy("01-06-2019"))

# We must convert the region to a factor to control the order of the legend
countries <- c(
  "United Kingdom", "England", "Scotland", "Wales", "Northern Ireland"
)
plot_df$region <- factor(plot_df$region, levels = countries)

# Map the data to the aesthetics: one coloured line per region
g <- ggplot(plot_df, aes(x = date, y = yearly_change, color = region))

# Add geoms
g <- g + geom_line(linewidth = 1.5, alpha = 0.8)

# Customise the axes
g <- g +
  scale_x_date(
    name = "Date",
    date_labels = "%b %Y",
    date_breaks = "6 months",
    date_minor_breaks = "1 month",
    # No padding on the left of the axis, 10 units of padding on the right
    expand = expansion(add = c(0, 10))
  ) +
  scale_y_continuous(
    name = "Yearly change (%)",
    limits = c(-4, 20),
    # Breaks stop at the upper limit: a break beyond `limits` is never drawn
    breaks = seq(-4, 20, 2)
  )

# Customise the legend: a fixed colour for each region
color_mapping <- c(
  "United Kingdom"   = "#F78E1E",
  "England"          = "#A7A9AC",
  "Scotland"         = "#83D2E4",
  "Wales"            = "#EBE72A",
  "Northern Ireland" = "#A0CF67"
)

g <- g + scale_color_manual(name = "Region", values = color_mapping)

# Customise the theme
g <- g + theme_bw()

# Display the finished plot
g
library(ggplot2)

# The plot starts in June 2019
plot_df <- df %>% filter(date >= dmy("01-06-2019"))

# We must convert the region to a factor to control the order of the legend
countries <- c(
  "United Kingdom", "England", "Scotland", "Wales", "Northern Ireland"
)
plot_df$region <- factor(plot_df$region, levels = countries)

# Customise the legend: a fixed colour for each region
color_mapping <- c(
  "United Kingdom"   = "#F78E1E",
  "England"          = "#A7A9AC",
  "Scotland"         = "#83D2E4",
  "Wales"            = "#EBE72A",
  "Northern Ireland" = "#A0CF67"
)

# Build the whole plot in a single expression, adding one layer per `+`
g <- ggplot(plot_df, aes(x = date, y = yearly_change, color = region)) +
  geom_line(linewidth = 1.5, alpha = 0.8) +
  scale_x_date(
    name = "Date",
    date_labels = "%b %Y",
    date_breaks = "6 months",
    date_minor_breaks = "1 month",
    # No padding on the left of the axis, 10 units of padding on the right
    expand = expansion(add = c(0, 10))
  ) +
  scale_y_continuous(
    name = "Yearly change (%)",
    limits = c(-4, 20),
    # Breaks stop at the upper limit: a break beyond `limits` is never drawn
    breaks = seq(-4, 20, 2)
  ) +
  scale_color_manual(name = "Region", values = color_mapping) +
  theme_bw()

# Display the finished plot
g