library(tidyverse)
library(lubridate)
library(httr2)

#' Extract the title, description and views from an article
#'
#' Note that this function is generic enough to work with any named list that
#' contains a title, description and views. However, it is designed to work
#' with the output of the Wikimedia API, which is why the parameter is called
#' `article`.
#'
#' Fields are looked up by their exact name. A field that is absent from
#' `article` is returned as `NA`, so the result always has the same three
#' columns.
#'
#' @param article A list containing, amongst other things, the title,
#'   description and views of an article
#'
#' @return A tibble with three columns: title, description and views
#'
#' @examples
#' extract_article_info(
#'   list(title = "Hello", description = "World", views = 1000)
#' )
#'
extract_article_info <- function(article) {
  # `[[` matches names exactly, whereas `$` partial-matches on lists: with `$`,
  # an article lacking `description` but carrying a longer key that starts
  # with "description" would silently return that other field instead.
  field_or_na <- function(name, missing_value) {
    value <- article[[name]]
    if (is.null(value)) missing_value else value
  }

  tibble(
    title = field_or_na("title", NA_character_),
    description = field_or_na("description", NA_character_),
    views = field_or_na("views", NA_integer_)
  )
}

#' Get the most read articles from the English Wikipedia on a given day
#'
#' @param ymd_date A lubridate date object
#' @param access_token The access token to use to authenticate with the
#'   Wikimedia API. To obtain an access token from Wikimedia, see the
#'   instructions on
#'   https://api.wikimedia.org/wiki/Getting_started_with_Wikimedia_APIs
#'
#' @return A tibble with four columns: title, description, views and date
#'
#' @examples
#' get_most_read_en_articles(ymd("2023-07-16"), access_token)
#'
get_most_read_en_articles <- function(ymd_date, access_token) {
  # To get the most read articles of a given day, we need the URL to point to
  # the following day. It's awkward, but we can't control it, that's how the
  # API works.
  url_date <- ymd_date + lubridate::days(1)
  url <- sprintf(
    "https://api.wikimedia.org/feed/v1/wikipedia/en/featured/%04d/%02d/%02d",
    year(url_date), month(url_date), day(url_date)
  )

  response <- request(url) %>%
    req_auth_bearer_token(access_token) %>%
    req_perform()
  json_response <- response %>%
    resp_body_json()
  most_read <- json_response[["mostread"]][["articles"]]

  # We use lapply to apply the extract_article_info function to each article.
  # This returns a list of tibbles, which bind_rows then combines into one
  # single tibble.
  df <- lapply(most_read, extract_article_info) %>%
    bind_rows()

  # It's more efficient to add the date column after the lapply loop
  df$date <- ymd_date
  df
}

#### MAIN SCRIPT ####

# read_file() returns the file content verbatim; trimws() removes surrounding
# whitespace (e.g. a trailing newline added by a text editor) so that it does
# not end up inside the bearer token.
access_token <- trimws(readr::read_file("my_secret_access_token.txt"))
df_most_read <- get_most_read_en_articles(ymd("2023-07-16"), access_token)
#### 🏡 Bonus Task ####
# Build the vector of days to query: every day from 1 January 2023 up to and
# including yesterday.
first_january_2023 <- lubridate::ymd("2023-01-01")
yesterday <- lubridate::today() - lubridate::days(1)
all_dates <- seq(first_january_2023, yesterday, by = "1 day")

# The following code is slow, because it makes one API call per day of the year
df_all_dates <- lapply(
  all_dates,
  get_most_read_en_articles,
  access_token = access_token
) %>%
  bind_rows()
# Simply replace lapply with pblapply
library(pbapply)

# Build the vector of days to query: every day from 1 January 2023 up to and
# including yesterday.
first_january_2023 <- lubridate::ymd("2023-01-01")
yesterday <- lubridate::today() - lubridate::days(1)
all_dates <- seq(first_january_2023, yesterday, by = "1 day")

# The following code is slow, because it makes one API call per day of the year
df_all_dates <- pblapply(
  all_dates,
  get_most_read_en_articles,
  access_token = access_token
) %>%
  bind_rows()