✅ (Solutions) Lab 06

Author

Published

18 July 2023

Main Exercise

library(tidyverse)
library(lubridate)
library(httr2)

#' Extract the title, description and views from an article
#'
#' Note that this function is generic enough to work with any named list that contains
#' a title, description and views. However, it is designed to work with the output of
#' the Wikimedia API, which is why the parameter is called `article`.
#'
#' Fields are looked up by their exact name. A field that is absent from `article`
#' is returned as `NA` instead of being silently dropped, so the result always has
#' the same three columns, in the same order.
#'
#' @param article A list containing, amongst other things, the title, description and views of an article
#'
#' @return A tibble with three columns: title, description and views
#'
#' @examples
#' extract_article_info(list(title="Hello", description="World", views=1000))
#' extract_article_info(list(title="Hello", views=1000)) # description is NA
#'
extract_article_info <- function(article) {

  # tibble() drops NULL inputs, which would remove the whole column;
  # swap a missing field for a typed NA so the column is always present
  or_na <- function(value, na_value) {
    if (is.null(value)) na_value else value
  }

  # `[[` matches names exactly, whereas `$` on a plain list allows partial matches
  tibble(
    title = or_na(article[["title"]], NA_character_),
    description = or_na(article[["description"]], NA_character_),
    views = or_na(article[["views"]], NA_integer_)
  )
}

#' Get the most read articles from the English Wikipedia on a given day
#'
#' Requests the "featured" feed of the Wikimedia API for the day after `ymd_date`
#' and keeps the articles listed in the `mostread` section of the response.
#'
#' @param ymd_date A single lubridate date object: the day whose most read
#'                 articles are wanted
#' @param access_token The access token to use to authenticate with the Wikimedia API.
#'                     To obtain an access token from Wikimedia, see the instructions
#'                     on https://api.wikimedia.org/wiki/Getting_started_with_Wikimedia_APIs
#'
#' @return A tibble with four columns: title, description, views and date.
#'         When the response lists no `mostread` articles, the tibble has zero
#'         rows but still has these four columns.
#'
#' @examples
#' get_most_read_en_articles(ymd("2023-07-16"), access_token)
#'
get_most_read_en_articles <- function(ymd_date, access_token) {

  # The URL is built for exactly one day; a vector of dates would yield several URLs
  stopifnot(length(ymd_date) == 1)

  # To get the most read articles of a given day, we need the URL to point to the following day
  # It's awkward, but we can't control it, that's how the API works
  url_date <- ymd_date + lubridate::days(1)

  url <-
    sprintf("https://api.wikimedia.org/feed/v1/wikipedia/en/featured/%04d/%02d/%02d",
            year(url_date),
            month(url_date),
            day(url_date))

  response <-
    request(url) %>%
    req_auth_bearer_token(access_token) %>%
    req_perform()

  json_response <- response %>% resp_body_json()
  most_read <- json_response[["mostread"]][["articles"]]

  # NOTE(review): the `mostread` section seems to be absent when the page-view
  # data for that day is not available yet -- confirm against the API docs.
  # Return a typed, empty tibble so callers always get the documented columns.
  if (length(most_read) == 0) {
    return(tibble(title = character(),
                  description = character(),
                  views = integer(),
                  date = ymd_date[0]))
  }

  # We use lapply to apply the extract_article_info function to each article
  # This returns a list of tibbles
  # We then use bind_rows to combine all the tibbles into one single tibble
  df <- lapply(most_read, extract_article_info) %>% bind_rows()

  # It's more efficient to add the date column after the lapply loop
  df$date <- ymd_date

  df

}

#### MAIN SCRIPT ####

# read_file() returns the file content as one string, exactly as stored, so a
# trailing newline left by a text editor would become part of the token.
# Strip the surrounding whitespace so that only the token itself is sent.
access_token <- trimws(readr::read_file("my_secret_access_token.txt"))

# Most read articles of one example day
df_most_read <- get_most_read_en_articles(ymd("2023-07-16"), access_token)

🏡 Bonus Task

# Every day from the 1st of January 2023 up to, and including, yesterday
first_january_2023 <- lubridate::ymd("2023-01-01")
yesterday <- lubridate::today() - lubridate::days(1)
all_dates <- seq(from = first_january_2023, to = yesterday, by = "1 day")

# Slow: one API call is made for each date in `all_dates`.
# Each call yields one tibble; bind_rows() stacks them into a single tibble.
df_all_dates <- bind_rows(
  lapply(
    all_dates,
    function(single_date) {
      get_most_read_en_articles(single_date, access_token = access_token)
    }
  )
)

Click here to see a version with a progress bar

library(pbapply) # Simply replace lapply with pblapply

# Every day from the 1st of January 2023 up to, and including, yesterday
first_january_2023 <- lubridate::ymd("2023-01-01")
yesterday <- lubridate::today() - lubridate::days(1)
all_dates <- seq(from = first_january_2023, to = yesterday, by = "1 day")

# Slow: one API call is made for each date in `all_dates`.
# pblapply() is used like lapply() and shows a progress bar while it runs;
# bind_rows() then stacks the per-day tibbles into a single tibble.
df_all_dates <- bind_rows(
  pblapply(
    all_dates,
    function(single_date) {
      get_most_read_en_articles(single_date, access_token = access_token)
    }
  )
)