✅ (Solutions) Lab 01
Solutions to the lab 01 exercises.
Part 1
Part 2
Base R version
# Load the grocery dataset into a base R data frame.
# Adjust the path accordingly
df <- read.csv("../data/tesco-grocery/Dec_lsoa_grocery.csv")
Filtering the data frame to show only the rows for the area with code `E01004735`:
# Keep only the rows whose area_id equals "E01004735"; the empty column
# index after the comma retains every column.
df[df$area_id == "E01004735", ]
Tidyverse version
library(tidyverse)

# Note the use of read_csv instead of read.csv
df <- read_csv("../data/tesco-grocery/Dec_lsoa_grocery.csv")
Filtering the data frame to show only the rows for the area with code `E01004735`:
# Keep only the rows whose area_id equals "E01004735".
# Note: since R 4.1 the native pipe |> can be used here in place of %>%.
df %>% filter(area_id == "E01004735")
Part 3
Base R FULL solution:
# Load the grocery dataset into a base R data frame (adjust the path if needed)
df <- read.csv("../data/tesco-grocery/Dec_lsoa_grocery.csv")
#### STEP 1
# Columns to keep: the area identifier, the nutrient columns, and the
# population, age and area columns.
# Each name is listed once: repeating a name in df[...] would add a duplicate
# column (e.g. "protein.1").
# read.csv() makes column names syntactically valid by default, which is why
# the 65+ age column is referred to as "age_65." here.
selected_cols <- c(
  "area_id",
  "fat", "saturate", "salt", "protein", "sugar",
  "carb", "fibre", "alcohol",
  "population", "male", "female",
  "age_0_17", "age_18_64", "age_65.", "avg_age",
  "area_sq_km", "people_per_sq_km"
)

# Subset the data frame to the selected columns, in the order listed above
df <- df[selected_cols]
#### STEP 2
# Find regions with the highest average alcohol consumption:
# sort rows by alcohol in descending order, then keep the first three
highest_alcohol_regions <- df[order(df$alcohol, decreasing = TRUE), ]
highest_alcohol_regions <- head(highest_alcohol_regions, 3)

# Find regions with the lowest average alcohol consumption:
# order() sorts ascending by default, so the first three rows are the lowest
lowest_alcohol_regions <- df[order(df$alcohol), ]
lowest_alcohol_regions <- head(lowest_alcohol_regions, 3)

# Extract region names for highest and lowest alcohol consumption
highest_alcohol_names <- highest_alcohol_regions$area_id
lowest_alcohol_names <- lowest_alcohol_regions$area_id

# Find regions with the highest average sugar consumption
highest_sugar_regions <- df[order(df$sugar, decreasing = TRUE), ]
highest_sugar_regions <- head(highest_sugar_regions, 3)

# Find regions with the lowest average sugar consumption
lowest_sugar_regions <- df[order(df$sugar), ]
lowest_sugar_regions <- head(lowest_sugar_regions, 3)

# Extract region names for highest and lowest sugar consumption
highest_sugar_names <- highest_sugar_regions$area_id
lowest_sugar_names <- lowest_sugar_regions$area_id
# Print the results: each cat() call writes a heading, a newline, the
# area ids separated by spaces, and trailing newline(s)
cat("Regions with highest average alcohol consumption:", "\n",
    highest_alcohol_names, "\n\n")

cat("Regions with lowest average alcohol consumption:", "\n",
    lowest_alcohol_names, "\n\n")

cat("Regions with highest average sugar consumption:", "\n",
    highest_sugar_names, "\n\n")

cat("Regions with lowest average sugar consumption:", "\n",
    lowest_sugar_names, "\n")
#### STEP 3
# One-row data frame holding the mean and standard deviation of the
# population column
pop_stats <- data.frame(mean_pop = mean(df$population),
                        std_pop = sd(df$population))
print(pop_stats)
#### STEP 4
# Choose two nutrients
nutrient1 <- df$alcohol
nutrient2 <- df$sugar

# Create scatterplot (pch = 16 draws filled circles)
plot(nutrient1, nutrient2, main = "Scatterplot of Nutrient1 vs Nutrient2",
     xlab = "Alcohol", ylab = "Sugar", pch = 16)
Tidyverse solution:
#### STEP 1
library(dplyr) # alternatively, library(tidyverse)
library(readr)

# Load the grocery dataset as a tibble (adjust the path if needed)
df <- read_csv("../data/tesco-grocery/Dec_lsoa_grocery.csv")

# Columns to keep: the area identifier, the nutrient columns, and the
# population, age and area columns. Each name is listed once.
# The 65+ age column is referred to by the name "age_65+" here.
selected_cols <- c(
  "area_id",
  "fat", "saturate", "salt", "protein", "sugar",
  "carb", "fibre", "alcohol",
  "population", "male", "female",
  "age_0_17", "age_18_64", "age_65+", "avg_age",
  "area_sq_km", "people_per_sq_km"
)

# all_of() selects columns from a character vector and errors if any of the
# names is missing from the data
df <- df %>% select(all_of(selected_cols))
#### STEP 2
# Three regions with the highest average alcohol consumption
highest_alcohol_regions <- df %>%
  arrange(desc(alcohol)) %>%
  head(3)

# Three regions with the lowest average alcohol consumption
lowest_alcohol_regions <- df %>%
  arrange(alcohol) %>%
  head(3)

# Extract region ids for highest and lowest alcohol consumption
highest_alcohol_names <- highest_alcohol_regions$area_id
lowest_alcohol_names <- lowest_alcohol_regions$area_id

# Three regions with the highest average sugar consumption
highest_sugar_regions <- df %>%
  arrange(desc(sugar)) %>%
  head(3)

# Three regions with the lowest average sugar consumption
lowest_sugar_regions <- df %>%
  arrange(sugar) %>%
  head(3)

# Extract region ids for highest and lowest sugar consumption
highest_sugar_names <- highest_sugar_regions$area_id
lowest_sugar_names <- lowest_sugar_regions$area_id
# Print the results: each cat() call writes a heading, a newline, the
# area ids separated by spaces, and trailing newline(s)
cat("Regions with highest average alcohol consumption:", "\n",
    highest_alcohol_names, "\n\n")

cat("Regions with lowest average alcohol consumption:", "\n",
    lowest_alcohol_names, "\n\n")

cat("Regions with highest average sugar consumption:", "\n",
    highest_sugar_names, "\n\n")

cat("Regions with lowest average sugar consumption:", "\n",
    lowest_sugar_names, "\n")
#### STEP 3
# One-row summary with the mean and standard deviation of the population
# column
pop_stats <- df %>%
  summarise(mean_pop = mean(population), sd_pop = sd(population))
print(pop_stats)
#### STEP 4
library(ggplot2) # if you haven't loaded tidyverse

# Choose two nutrients
nutrient1 <- df$alcohol
nutrient2 <- df$sugar

# Create scatterplot: one point per row, alcohol on x and sugar on y;
# labs() sets the title and axis labels explicitly
ggplot(data = df, aes(x = nutrient1, y = nutrient2)) +
  geom_point() +
  labs(title = "Scatterplot of Alcohol vs Sugar",
       x = "Alcohol", y = "Sugar")