✅ Week 02 Lab - Solutions

2023/24 Autumn Term

Author

Tabtim Duenger

Published

23 Jan 2024

This solution file follows the format of the Quarto (.qmd) file you had to fill in during the lab session. If you want to render the document yourself and play with the code, you can download the .qmd version of this solution file by clicking on the button below:

📋 Lab Tasks

⚙️ Setup

library(dplyr)       # for data manipulation
library(tidyr)       # for data reshaping
library(readr)       # for reading data
library(lubridate)   # for working with dates
library(tidyselect)  # for selecting columns

🛠 Part I: Data manipulation with dplyr

The final solution then is:

# Columns to keep from the raw data, and the regions we want to compare
selected_cols <- c("Date", "RegionName", "12m%Change")
selected_regions <- c(
  "United Kingdom",
  "England",
  "Scotland",
  "Wales",
  "Northern Ireland"
)

# Build the tidy data frame in one pipeline:
#   1. keep only the columns of interest
#   2. give them short, syntactic names (backticks are needed for `12m%Change`)
#   3. keep only the selected regions
#   4. drop rows with a missing yearly change
#   5. parse the date strings with lubridate::dmy()
#   6. sort from the most recent date to the oldest
df <- uk_hpi %>%
  select(all_of(selected_cols)) %>%
  rename(
    date          = Date,
    region        = RegionName,
    yearly_change = `12m%Change`
  ) %>%
  filter(region %in% selected_regions) %>%
  drop_na(yearly_change) %>%
  mutate(date = dmy(date)) %>%
  arrange(desc(date))

# Preview the first eight rows (i.e. the most recent observations)
head(df, n = 8)

Explanation

  • We use dplyr::select() together with tidyselect::all_of() to select the columns we want to keep.
  • We use dplyr::rename() to rename the columns we want to keep. Note that we wrap the 12m%Change column name in backticks because it is not a syntactically valid R name: it starts with a digit and contains the special character %. Without the backticks, the R parser would try to interpret it as code rather than as a column name.
  • We use dplyr::filter() to filter the rows we want to keep. We use the %in% operator to check whether each value of the region column is in the selected_regions vector. %in% is part of base R; you can read its documentation by running ?`%in%` (note the backticks) in the R console.

📋 Part II: Data visualisation with ggplot

Step-by-step

If you are not familiar with using ggplot for plotting, the line by line construction of the plot below might be useful. The + operator is used to add layers to the plot.

library(ggplot2)

# Regions in the order they should appear in the legend
countries <- c(
  "United Kingdom", "England", "Scotland", "Wales", "Northern Ireland"
)

# One colour per region (used by the manual colour scale below)
color_mapping <- c(
  "United Kingdom"   = "#F78E1E",
  "England"          = "#A7A9AC",
  "Scotland"         = "#83D2E4",
  "Wales"            = "#EBE72A",
  "Northern Ireland" = "#A0CF67"
)

# The plot starts in June 2019, so keep only observations from then on
plot_df <- df %>%
  filter(date >= dmy("01-06-2019"))

# Turn region into a factor: the order of its levels controls the order
# of the entries in the legend
plot_df[["region"]] <- factor(plot_df[["region"]], levels = countries)

# Step 1: map the data to the aesthetics (nothing is drawn yet)
g <- ggplot(
  data = plot_df,
  mapping = aes(x = date, y = yearly_change, color = region)
)

# Step 2: add the geom -- one semi-transparent line per region
g <- g + geom_line(linewidth = 1.5, alpha = 0.8)

# Step 3: customise the x axis (a labelled break every 6 months, minor
# breaks every month, no extra space on the left and 10 units on the right)
g <- g + scale_x_date(
  name = "Date",
  date_labels = "%b %Y",
  date_breaks = "6 months",
  date_minor_breaks = "1 month",
  expand = expansion(add = c(0, 10))
)

# Step 4: customise the y axis (fixed range, a break every 2 units)
g <- g + scale_y_continuous(
  name = "Yearly change (%)",
  limits = c(-4, 20),
  breaks = seq(-4, 22, 2)
)

# Step 5: customise the legend title and the line colours
g <- g + scale_color_manual(name = "Region", values = color_mapping)

# Step 6: apply the black-and-white theme
g <- g + theme_bw()

# Display the finished plot
g

Single ggplot call

library(ggplot2)

# Regions in the order they should appear in the legend
countries <- c(
  "United Kingdom", "England", "Scotland", "Wales", "Northern Ireland"
)

# One colour per region (used by the manual colour scale below)
color_mapping <- c(
  "United Kingdom"   = "#F78E1E",
  "England"          = "#A7A9AC",
  "Scotland"         = "#83D2E4",
  "Wales"            = "#EBE72A",
  "Northern Ireland" = "#A0CF67"
)

# The plot starts in June 2019, so keep only observations from then on
plot_df <- df %>%
  filter(date >= dmy("01-06-2019"))

# Turn region into a factor: the order of its levels controls the order
# of the entries in the legend
plot_df[["region"]] <- factor(plot_df[["region"]], levels = countries)

# Build the whole plot in a single expression; each `+` adds one layer,
# scale or theme to the plot
g <- ggplot(
  data = plot_df,
  mapping = aes(x = date, y = yearly_change, color = region)
) +
  # One semi-transparent line per region
  geom_line(linewidth = 1.5, alpha = 0.8) +
  # x axis: a labelled break every 6 months, minor breaks every month
  scale_x_date(
    name = "Date",
    date_labels = "%b %Y",
    date_breaks = "6 months",
    date_minor_breaks = "1 month",
    expand = expansion(add = c(0, 10))
  ) +
  # y axis: fixed range with a break every 2 units
  scale_y_continuous(
    name = "Yearly change (%)",
    limits = c(-4, 20),
    breaks = seq(-4, 22, 2)
  ) +
  # Legend title and line colours
  scale_color_manual(name = "Region", values = color_mapping) +
  theme_bw()

# Display the finished plot
g

🏡 Bonus Task

# Build a wide table with one row per date holding the average house price of
# Inner London and Outer London relative to the United Kingdom average.
price_plot <- uk_hpi %>%
  # Keep only the two London regions and the UK-wide benchmark
  filter(RegionName %in% c("Outer London", "Inner London", "United Kingdom")) %>%
  select(Date, RegionName, AveragePrice) %>%
  # Pivot the data frame from long to wide: one price column per region.
  # With a single `values_from` column, pivot_wider() names the new columns
  # after the region only ("Inner London", ...), so `names_prefix` is required
  # to obtain the `AveragePrice_<region>` columns referenced below.
  pivot_wider(
    names_from   = RegionName,
    values_from  = AveragePrice,
    names_prefix = "AveragePrice_"
  ) %>%
  # Drop rows with a missing value in any column
  drop_na() %>%
  # Convert the date strings (day/month/year) into Date objects
  mutate(Date = as.Date(Date, format = "%d/%m/%Y")) %>%
  # Sort from the most recent date to the oldest
  arrange(desc(Date)) %>%
  # Create the new variables: each London price as a ratio of the UK price
  mutate(
    AveragePriceComparisonInnerLondon =
      `AveragePrice_Inner London` / `AveragePrice_United Kingdom`,
    AveragePriceComparisonOuterLondon =
      `AveragePrice_Outer London` / `AveragePrice_United Kingdom`
  ) %>%
  # Keep the date, the two ratios and the UK price they are relative to
  select(
    Date,
    AveragePriceComparisonInnerLondon,
    AveragePriceComparisonOuterLondon,
    `AveragePrice_United Kingdom`
  )
# Plot the Inner and Outer London price ratios (relative to the UK) over time
ggplot(data = price_plot, aes(x = Date)) +
  # One line per ratio; the constant colour labels become the legend entries
  geom_line(aes(y = AveragePriceComparisonInnerLondon, color = "Inner London")) +
  geom_line(aes(y = AveragePriceComparisonOuterLondon, color = "Outer London")) +
  # Add title and axis titles
  labs(title = "Average difference in house price between regions of London and the United Kingdom",
       y = "Relative house price",
       x = "Date",
       color = "Region") +
  # Change theme
  theme_minimal() +
  # Change y limits
  ylim(0, 3) +
  # Add a horizontal reference line at y = 1 (price equal to the UK average).
  # `linewidth` replaces `size` for lines since ggplot2 3.4.0.
  geom_hline(yintercept = 1, linetype = "dashed", linewidth = 0.5) +
  # Change colour scheme for the lines
  scale_color_brewer(palette = "Dark2") +
  # Format the x axis labels as year-month
  scale_x_date(date_labels = "%Y-%m") +
  # Change the title size
  theme(
    plot.title = element_text(size = 12)
  )