✅ (Solutions) Lab 02

Author

Published

11 July 2023

Solutions to the Lab 02 exercises.

(We’re showing the code with explanations, but you probably solved it more concisely, in s single .R script.)

Part 1

Imports

#### PART 1: ⚙️ Setup ####

library(readr)
library(dplyr)
library(xml2)

Data Input

# Adjust the path to the data file as needed
df <- read_csv("../data/tesco-grocery/Dec_lsoa_grocery.csv")

Part 2

In case you want to see the data

selected_cols <- c("area_id",
                   "fat", "saturate", "salt", "protein", "sugar",
                   "protein", "carb", "fibre", "alcohol")

View(df[selected_cols])
# or
df %>% select(all_of(selected_cols)) %>% View()

Create an initial empty XML document with just the tag

First, you will need to load this new library called xml2:

library(xml2)

Then, you can create an empty XML document with the xml_new_root() function:

#### PART 2: Working with XML ####

# Specifying the encoding is optional but important to avoid encoding issues
tesco_data_xml <- xml_new_root("data", .encoding = "UTF-8")
tesco_data_xml

Create a new XML node for each row of the data

row1 <- df %>% slice(1)

# Create a new XML node. 
area_node <- xml2::xml_new_root("area", area_id = row1$area_id)
area_node

If you were to save this to a .xml file (with xml2::write_xml(area_node, 'sample.xml')) and open it in a text editor, you would see something like this:

<?xml version="1.0" encoding="UTF-8"?>
<area area_id="E01000001"/>

Note that the area_id attribute is not a child node, but an attribute of the area node. The tag area has no children and for that reason, the xml2 package automatically closes it with a / at the end: <area ... />.

Click here to see alternative ways to do the same thing

You didn’t need to create a new variable row1 to store the first row of the data. You could have done it directly in the xml_new_root() function:

# Alternative 1: (advanced) Pure pipe chain
area_node <- 
    xml2::xml_new_root("area", 
                       area_id = df %>% slice(1) %>% pull(area_id))

The pull function is a shortcut for dplyr::pull(df, area_id). It extracts the area_id column from the data frame and returns it as a vector.

Another alternative is to just use base R, which will probably be easier to understand:

# Alternative 2: (easier) Base R
area_node <- xml2::xml_new_root("area", area_id = df[1, "area_id"])

Now, add the other nutrients as ‘children’ of the area node

Here we will use a for loop to iterate over the nutrient names and add them as children of the area node:

# Create a new node for the nutrients
for(nutrient_name in selected_cols[-1]){
  xml_add_child(area_node, nutrient_name, row1[[nutrient_name]])
}

(In the future, we will show you a more advanced way to do this using the purrr package.)

If you were to save this to a .xml file (with xml2::write_xml(area_node, 'sample.xml')) and open it in a text editor, you would see something like this:

<?xml version="1.0" encoding="UTF-8"?>
<area area_id="E01000001">
  <fat>9.02807973835848</fat>
  <saturate>3.7293430929761</saturate>
  <salt>0.556402429528114</salt>
  <protein>5.38504905777922</protein>
  <sugar>9.65265534963392</sugar>
  <protein>5.38504905777922</protein>
  <carb>16.237019155895</carb>
  <fibre>1.67400716399314</fibre>
  <alcohol>0.347539336551938</alcohol>
</area>

Then add the area_node to the root node:

xml2::xml_add_child(tesco_data_xml, area_node)

Then, save the XML:

xml2::write_xml(tesco_data_xml, "sample_tesco_data.xml")

If you open this file on a text editor, you will see something like this:

<?xml version="1.0" encoding="UTF-8"?>
<data>
  <area area_id="E01000001">
    <fat>9.02807973835848</fat>
    <saturate>3.7293430929761</saturate>
    <salt>0.556402429528114</salt>
    <protein>5.38504905777922</protein>
    <sugar>9.65265534963392</sugar>
    <protein>5.38504905777922</protein>
    <carb>16.237019155895</carb>
    <fibre>1.67400716399314</fibre>
    <alcohol>0.347539336551938</alcohol>
  </area>
</data>

Part 3

This part calls for a custom function. As you will need to run the same code again and again for each row in the dataset, it makes sense to create a function that does this for you. Let’s call it get_area_node:

get_area_node <- function(row){
  area_node <- xml2::xml_new_root("area", area_id = row$area_id)
  
  for(nutrient_name in selected_cols[-1]){
    xml_add_child(area_node, nutrient_name, row[[nutrient_name]])
  }
  
  return(area_node)
}

Then, you will need an external for loop to iterate over the rows of the data frame:

tesco_data_xml <- xml_new_root("data", .encoding = "UTF-8")
# In future labs (Week 02 onwards), we will stop using for loops
# and use purrr::map() and derivatives
for (i in 1:10) {
  tesco_data_xml %>% xml_add_child(get_area_node(df[i, ]))
}

write_xml(tesco_data_xml, "sample_tesco_data.xml")

Part 4

All you have to change is the get_area_node() to add a <nutrients> tag:

get_area_node <- function(row){
  area_node <- xml2::xml_new_root("area", area_id = row$area_id)
  
  # Create the 'nutrients' parent node
  nutrients_node <- xml2::xml_add_child(area_node, "nutrients")
  
  for(nutrient_name in selected_cols[-1]){
    xml_add_child(nutrients_node, nutrient_name, row[[nutrient_name]])
  }
  
  return(area_node)
}

The rest of the code is the same as before:

tesco_data_xml <- xml_new_root("data", .encoding = "UTF-8")
for (i in 1:10) {
  tesco_data_xml %>% xml_add_child(get_area_node(df[i, ]))
}

write_xml(tesco_data_xml, "sample_tesco_data.xml")

🏡 Bonus Task

Full solution:

library(xml2)
library(dplyr)

#### CONSTANTS ####

NUTRIENTS <- c("fat", "saturate", "salt", 
               "protein", "sugar", "protein", 
               "carb", "fibre", "alcohol")

STATS_SUFFIXES <- c("", "std", "ci95",
                    "perc2.5", "perc25", 
                    "perc50", "perc75", 
                    "perc97.5")

#### FUNCTIONS ####

get_nutrient_node <- function(row, nutrient_name){
  nutrient_node <- xml_new_root(nutrient_name)
  stat_col_names <- if_else(STATS_SUFFIXES == "", 
                            nutrient_name,
                            paste(nutrient_name, STATS_SUFFIXES, sep="_"))


  for(stat_col in stat_col_names){
    stat_element <- xml_new_root(stat_col, row[[stat_col]])
    xml2::xml_add_child(nutrient_node, stat_element)
  }
  
  return(nutrient_node)
}

get_area_node <- function(row){
  area_node <- xml2::xml_new_root("area", area_id = row$area_id)
  
  for(nutrient_name in NUTRIENTS){
    xml2::xml_add_child(area_node, get_nutrient_node(row, nutrient_name))
  }
  
  return(area_node)
}

#### MAIN ####

df <- read_csv("2023/data/tesco-grocery/Dec_lsoa_grocery.csv")

tesco_data_xml <- xml_new_root("data", .encoding = "UTF-8")
for (i in 1:10) {
  tesco_data_xml %>% xml_add_child(get_area_node(df[i, ]))
}

write_xml(tesco_data_xml, "sample_tesco_data.xml")