✅ (Solutions) Lab 02
Solutions to the Lab 02 exercises.
(We’re showing the code with explanations, but you probably solved it more concisely, in a single .R
script.)
Part 1
Part 1
Imports
#### PART 1: ⚙️ Setup ####
library(readr)
library(dplyr)
library(xml2)
Data Input
# Read the grocery CSV into `df`.
# Adjust the path to the data file as needed
df <- read_csv("../data/tesco-grocery/Dec_lsoa_grocery.csv")
Part 2
Part 2
In case you want to see the data
# Columns to inspect: the area identifier followed by the nutrient columns.
# NOTE(review): "protein" is listed twice, which is why <protein> shows up
# twice in the XML output; drop the second entry if that is unintended.
selected_cols <- c("area_id",
                   "fat", "saturate", "salt", "protein", "sugar",
                   "protein", "carb", "fibre", "alcohol")

# Open the selected columns in the data viewer
View(df[selected_cols])
# or
df %>% select(all_of(selected_cols)) %>% View()
Create an initial empty XML document with just the <data> tag
First, you will need to load this new library called xml2
:
library(xml2)
Then, you can create an empty XML document with the xml_new_root()
function:
#### PART 2: Working with XML ####
# Create the document with a single, empty <data> root element.
# Specifying the encoding is optional but important to avoid encoding issues
tesco_data_xml <- xml_new_root("data", .encoding = "UTF-8")

# Print the (still empty) document
tesco_data_xml
Create a new XML node for each row of the data
# Keep only the first row of the data and print it
row1 <- df %>% slice(1)
row1

# Create a new XML node.
# Named arguments (here `area_id`) become attributes of the <area> tag.
area_node <- xml2::xml_new_root("area", area_id = row1$area_id)
area_node
If you were to save this to a .xml
file (with xml2::write_xml(area_node, 'sample.xml')
) and open it in a text editor, you would see something like this:
<?xml version="1.0" encoding="UTF-8"?>
<area area_id="E01000001"/>
Note that the area_id
attribute is not a child node, but an attribute of the area
node. The tag area
has no children and for that reason, the xml2 package automatically closes it with a /
at the end: <area ... />
.
Click here to see alternative ways to do the same thing
You didn’t need to create a new variable row1
to store the first row of the data. You could have done it directly in the xml_new_root()
function:
# Alternative 1: (advanced) Pure pipe chain
# The first row's area_id is extracted inline instead of going through `row1`.
area_node <-
  xml2::xml_new_root("area",
                     area_id = df %>% slice(1) %>% pull(area_id))
Here, pull(area_id) at the end of the pipe is equivalent to calling dplyr::pull(df %>% slice(1), area_id):
it extracts the area_id
column from the (one-row) data frame and returns it as a vector.
Another alternative is to just use base R, which will probably be easier to understand:
# Alternative 2: (easier) Base R
# `df` comes from read_csv(), i.e. it is a tibble, and `df[1, "area_id"]`
# would return a 1x1 tibble rather than a plain value. Extract the column
# with [[ ]] first and then take its first element.
area_node <- xml2::xml_new_root("area", area_id = df[["area_id"]][1])
Now, add the other nutrients as ‘children’ of the area
node
Here we will use a for
loop to iterate over the nutrient names and add them as children of the area
node:
# Append one child element per nutrient column to the <area> node.
# The first entry of `selected_cols` ("area_id") is skipped.
for (nutrient in tail(selected_cols, -1)) {
  xml2::xml_add_child(area_node, nutrient, row1[[nutrient]])
}
(In the future, we will show you a more advanced way to do this using the purrr
package.)
If you were to save this to a .xml
file (with xml2::write_xml(area_node, 'sample.xml')
) and open it in a text editor, you would see something like this:
<?xml version="1.0" encoding="UTF-8"?>
<area area_id="E01000001">
  <fat>9.02807973835848</fat>
  <saturate>3.7293430929761</saturate>
  <salt>0.556402429528114</salt>
  <protein>5.38504905777922</protein>
  <sugar>9.65265534963392</sugar>
  <protein>5.38504905777922</protein>
  <carb>16.237019155895</carb>
  <fibre>1.67400716399314</fibre>
  <alcohol>0.347539336551938</alcohol>
</area>
Then add the area_node
to the root node:
# Attach the <area> node as a child of the <data> root
xml2::xml_add_child(tesco_data_xml, area_node)
Then, save the XML:
# Write the whole document to disk
xml2::write_xml(tesco_data_xml, "sample_tesco_data.xml")
If you open this file on a text editor, you will see something like this:
<?xml version="1.0" encoding="UTF-8"?>
<data>
  <area area_id="E01000001">
    <fat>9.02807973835848</fat>
    <saturate>3.7293430929761</saturate>
    <salt>0.556402429528114</salt>
    <protein>5.38504905777922</protein>
    <sugar>9.65265534963392</sugar>
    <protein>5.38504905777922</protein>
    <carb>16.237019155895</carb>
    <fibre>1.67400716399314</fibre>
    <alcohol>0.347539336551938</alcohol>
  </area>
</data>
Part 3
Part 3
This part calls for a custom function. As you will need to run the same code again and again for each row in the dataset, it makes sense to create a function that does this for you. Let’s call it get_area_node
:
# Build the <area> XML node for a single row of the data.
#
# row: a one-row data frame with an `area_id` column and one column for each
#      nutrient named in `selected_cols[-1]`.
# Returns the <area> node, with `area_id` as an attribute and one child
# element per nutrient.
get_area_node <- function(row) {
  area_node <- xml2::xml_new_root("area", area_id = row$area_id)

  # The first entry of `selected_cols` is "area_id", already used as attribute
  for (nutrient_name in selected_cols[-1]) {
    xml2::xml_add_child(area_node, nutrient_name, row[[nutrient_name]])
  }

  return(area_node)
}
Then, you will need an external for
loop to iterate over the rows of the data frame (the code below only processes the first 10 rows):
# Start again from an empty <data> root
tesco_data_xml <- xml_new_root("data", .encoding = "UTF-8")

# In future labs (Week 02 onwards), we will stop using for loops
# and use purrr::map() and derivatives
# Only the first 10 rows are converted; seq_len()/min() keeps the loop valid
# even when the data frame has fewer than 10 rows.
for (i in seq_len(min(10, nrow(df)))) {
  tesco_data_xml %>% xml_add_child(get_area_node(df[i, ]))
}

write_xml(tesco_data_xml, "sample_tesco_data.xml")
Part 4
Part 4
All you have to change is the get_area_node()
to add a <nutrients> tag:
# Build the <area> XML node for a single row of the data, grouping all
# nutrient values under a <nutrients> child.
#
# row: a one-row data frame with an `area_id` column and one column for each
#      nutrient named in `selected_cols[-1]`.
# Returns the <area> node: `area_id` is an attribute and <nutrients> holds one
# child element per nutrient.
get_area_node <- function(row) {
  area_node <- xml2::xml_new_root("area", area_id = row$area_id)

  # Create the 'nutrients' parent node
  nutrients_node <- xml2::xml_add_child(area_node, "nutrients")

  # The first entry of `selected_cols` is "area_id", already used as attribute
  for (nutrient_name in selected_cols[-1]) {
    xml2::xml_add_child(nutrients_node, nutrient_name, row[[nutrient_name]])
  }

  return(area_node)
}
The rest of the code is the same as before:
# Start again from an empty <data> root
tesco_data_xml <- xml_new_root("data", .encoding = "UTF-8")

# Only the first 10 rows are converted; seq_len()/min() keeps the loop valid
# even when the data frame has fewer than 10 rows.
for (i in seq_len(min(10, nrow(df)))) {
  tesco_data_xml %>% xml_add_child(get_area_node(df[i, ]))
}

write_xml(tesco_data_xml, "sample_tesco_data.xml")
🏡 Bonus Task
🏡 Bonus Task
Full solution:
library(xml2)
library(dplyr)
#### CONSTANTS ####
# Nutrient column names; each one becomes a child element of <area>.
# Every nutrient is listed once so that no element is emitted twice.
NUTRIENTS <- c("fat", "saturate", "salt",
               "protein", "sugar",
               "carb", "fibre", "alcohol")

# Suffixes of the per-nutrient statistics columns. The empty suffix stands for
# the column named after the nutrient itself.
STATS_SUFFIXES <- c("", "std", "ci95",
                    "perc2.5", "perc25",
                    "perc50", "perc75",
                    "perc97.5")
#### FUNCTIONS ####
# Build the XML node for one nutrient of one area.
#
# row:           a one-row data frame holding the nutrient column and its
#                statistics columns ("<nutrient>_<suffix>").
# nutrient_name: name of the nutrient; used as the tag name and as the prefix
#                of the statistics column names.
# Returns a node named after the nutrient, with one child per statistics
# column (the child tag is the column name, its text the column value).
get_nutrient_node <- function(row, nutrient_name) {
  nutrient_node <- xml2::xml_new_root(nutrient_name)

  # The empty suffix maps to the column named after the nutrient itself;
  # every other suffix maps to "<nutrient>_<suffix>".
  stat_col_names <- dplyr::if_else(
    STATS_SUFFIXES == "",
    nutrient_name,
    paste(nutrient_name, STATS_SUFFIXES, sep = "_")
  )

  # Add each statistic directly as a child element, rather than building a
  # separate throwaway document for every value.
  for (stat_col in stat_col_names) {
    xml2::xml_add_child(nutrient_node, stat_col, row[[stat_col]])
  }

  return(nutrient_node)
}
# Build the <area> XML node for a single row of the data.
#
# row: a one-row data frame with an `area_id` column plus the columns read by
#      get_nutrient_node() for every entry of NUTRIENTS.
# Returns the <area> node, with `area_id` as an attribute and one child node
# per nutrient.
get_area_node <- function(row) {
  area_node <- xml2::xml_new_root("area", area_id = row$area_id)

  for (nutrient_name in NUTRIENTS) {
    xml2::xml_add_child(area_node, get_nutrient_node(row, nutrient_name))
  }

  return(area_node)
}
#### MAIN ####
# read_csv() is called through its namespace because this script attaches
# only xml2 and dplyr.
df <- readr::read_csv("2023/data/tesco-grocery/Dec_lsoa_grocery.csv")
df

# Empty document with a single <data> root
tesco_data_xml <- xml_new_root("data", .encoding = "UTF-8")

# Only the first 10 rows are converted; seq_len()/min() keeps the loop valid
# even when the data frame has fewer than 10 rows.
for (i in seq_len(min(10, nrow(df)))) {
  tesco_data_xml %>% xml_add_child(get_area_node(df[i, ]))
}

write_xml(tesco_data_xml, "sample_tesco_data.xml")