# Parse one stats-item <div> (votes / answers / views) into a one-row
# data frame with columns `unit` and `number`.
# NOTE(review): extraction had moved the assignment targets onto the
# wrong lines; reconstructed into valid R.
parse_div_summary <- function(current_div_summary) {
  one_summary_dict <-
    data.frame(
      unit = current_div_summary %>%
        html_element("span.s-post-summary--stats-item-unit") %>%
        html_text(),
      number = current_div_summary %>%
        html_element("span.s-post-summary--stats-item-number") %>%
        html_text()
    )
  one_summary_dict
}
# Gather every stats item inside one post <div> and bind them into a
# single data frame (one row per stat).
parse_all_summaries <- function(current_post) {
  all_div_summaries <- current_post %>%
    html_elements("div.s-post-summary--stats-item")

  all_summaries <- lapply(all_div_summaries, parse_div_summary)
  all_summaries <- bind_rows(all_summaries)  # R's equivalent to Python's pd.concat
  all_summaries
}
# Parse one post <div>: extract its title and timestamp, then attach
# them as columns to the stats data frame for that post.
parse_post_div <- function(current_post) {
  post_title <- current_post %>%
    html_element("h3.s-post-summary--content-title") %>%
    html_text() %>%
    str_trim()

  post_time <- current_post %>%
    html_element("time span") %>%
    html_attr("title")

  df_summary <- parse_all_summaries(current_post)

  # We can reuse the data frame returned from `parse_all_summaries`
  df_summary$post_title <- post_title
  df_summary$post_time <- post_time

  df_summary
}
# Parse every post on the page: select all post <div>s inside the main
# `div#questions` container and row-bind the per-post data frames.
parse_all_posts <- function(response_html) {
  all_posts <- html_element(response_html, css = "div#questions") %>%
    html_elements(css = "div.s-post-summary")

  # bind_rows() already produces fresh sequential row names, so no
  # index reset is needed (unlike pandas' reset_index in Python).
  df_results <- bind_rows(lapply(all_posts, parse_post_div))
  df_results
}
👨🏫 Week 08 - (Re-)shaping data, data normalisation and databases
DS105 - Data for Data Science
Setup
You might need to open the terminal to install packages if an import
does not work.
💡 Tip: Enable "Toggle Auto-Numbering" in the left-side corner of the JupyterLab user interface.
Import packages
import numpy as np
import pandas as pd
# import the required libraries
import requests
from bs4 import BeautifulSoup
# Importing plotnine like this will make your life easier
from plotnine import *
library(tidyverse) #rvest is part of tidyverse
Data Collection
Web Scraping: Frequent ‘python’ Questions
# sending a request to the site
= "https://stackoverflow.com/questions/tagged/python"
base_url = requests.get(base_url) response_html
# read the content of the site
<- "https://stackoverflow.com/questions/tagged/R"
base_url <- read_html(base_url) response_html
The response_html
object returned by the rvest::read_html
function is not as easy to explore as the one we get with the Python’s BeautifulSoup version of this tutorial. But they encode the same info.
How many lines does this page have?
len(response_html.text)
print(response_html)
Keep a tab with the documentation of rvest to consult how to do things: rvest documentation.
What will we extract?
Let’s extract the questions and related statistics. Let’s explore how you would do that:
Step-by-Step is the way to go
💡Here’s the key takeaway of this session: don’t write code to collect all the questions. Write code to collect ONLY ONE question first.
Extracting the right divs
After exploring the HTML, I found out I want all the <div class="s-post-summary">
inside the main <div id="questions">
:
# parsing page"s content
= BeautifulSoup(response_html.text)
soup
= soup.find("div", id="questions").find_all("div", class_="s-post-summary")
all_posts
print(f"We found {len(all_posts)} posts")
<-
all_posts html_element(response_html, css="div#questions") %>%
html_elements(css="div.s-post-summary")
cat(paste0("We found ", length(all_posts) , " posts")) # cat is another way to `print()` stuff
💡 Reinforcing the point above: even though we have 50 posts, let’s collect ONLY ONE first.
What is inside each div
?
= all_posts[0] # Select just the first one
current_post current_post
💡 Here’s a trick, you can use IPython.display to visualise the HTML snippet:
from IPython.display import display, HTML
str(current_post)) HTML(
<- all_posts[[1]] # Select just the first one
current_post %>% xml2::html_structure() current_post
Another way to explore this data more visually is to save it and open it on a code editor such as VScode (We used it in the 💻 Week 07 - Lab Roadmap)
Let’s break it all down to get the summary statistics
"div", class_="s-post-summary--stats-item")[0] current_post.find_all(
%>% html_elements(css="div.s-post-summary--stats-item") current_post
There are two ways to get the info I want:
- I could extract the element
title
inside of the<div>
- I could extract the text inside the
<span>
elements.
⁉️ Still puzzled about what some HTML element tags represent? The book Duckett, Jon. HTML and CSS: Design and Build Websites. New York: Wiley, 2011. is a great resource.
Let’s do it the hard way, extract from <span>
. I will show how to approach this with a for
loop first, since this is probably how you learned how to do things with programming.
If I just want to print stuff, I could just navigate the tree:
= current_post.find_all("div", class_="s-post-summary--stats-item")[0]
current_div_summary current_div_summary
<- current_post %>% html_elements(css="div.s-post-summary--stats-item")
current_div_summary <- current_div_summary[[1]]
current_div_summary current_div_summary
Let’s get the info we want:
"span",class_="s-post-summary--stats-item-unit").get_text() current_div_summary.find(
"span", class_="s-post-summary--stats-item-number").get_text() current_div_summary.find(
This would be better if we saved it in a pandas Series:
= "span.s-post-summary--stats-item-unit"
item_unit_css = "span.s-post-summary--stats-item-number"
item_num_css
= {"unit": current_div_summary.find("span", class_=item_unit_css).get_text(),
one_summary_dict "number": current_div_summary.find("span", class_=item_num_css).get_text()}
one_summary_dict
pd.Series(one_summary_dict)
Or a pandas DataFrame
=[0]) # do you know why we need the `index=[0]` bit here? pd.DataFrame(one_summary_dict, index
%>% html_element("span.s-post-summary--stats-item-unit") %>% html_text() current_div_summary
%>% html_element("span.s-post-summary--stats-item-number") %>% html_text() current_div_summary
This would be better if we saved it in a named list:
# Store the two CSS selectors once, then build a one-row data frame
# (R's closest analogue to the Python dict used above).
item_unit_css <- "span.s-post-summary--stats-item-unit"
item_num_css <- "span.s-post-summary--stats-item-number"

one_summary_dict <-
  data.frame(
    unit = current_div_summary %>% html_element(item_unit_css) %>% html_text(),
    number = current_div_summary %>% html_element(item_num_css) %>% html_text()
  )
one_summary_dict
Putting the pieces of the puzzle together: gathering summary statistics for a single post
However, the part people struggle with the most is figuring out how to pre-process and store the data correctly.
If you follow the strategy above, it will be much easier to think about that. Let’s explore together how we would gather all statistics from a current_div_summary
:
# an empty list that will be filled with multiple dictionaries
# (do you see why? ask your lecturer if not)
= []
all_stats
for current_div_summary in current_post.find_all("div", class_="s-post-summary--stats-item"):
= {"unit": current_div_summary.find("span",class_=item_unit_css).get_text(),
one_summary_dict "number": current_div_summary.find("span", class_=item_num_css).get_text()}
all_stats.append(one_summary_dict)
# What is in `all_stats` now?
all_stats
What happens if we try to convert it to a Series?
pd.Series(all_stats)
What happens if we try to convert it to a DataFrame?
pd.DataFrame(all_stats)
# An empty data frame that will be filled with multiple named lists
# (do you see why? ask your lecturer if not)
all_stats <- data.frame()

all_div_summaries <- current_post %>% html_elements("div.s-post-summary--stats-item")

# seq_along() is safer than 1:length(): it yields an empty sequence
# (instead of c(1, 0)) when there are no summaries.
for (j in seq_along(all_div_summaries)) {
  current_div_summary <- all_div_summaries[[j]]

  one_summary_dict <-
    data.frame(
      unit = current_div_summary %>%
        html_element("span.s-post-summary--stats-item-unit") %>%
        html_text(),
      number = current_div_summary %>%
        html_element("span.s-post-summary--stats-item-number") %>%
        html_text()
    )

  all_stats <- bind_rows(all_stats, one_summary_dict)
}

# What is in `all_stats` now?
all_stats
🎯 ACTION POINT: Would you organise this dataframe differently?
Going one step up: gathering summary statistics for ALL posts
How would we gather the summary for ALL posts?
= []
all_stats for current_post in all_posts:
for current_div_summary in current_post.find_all("div", class_="s-post-summary--stats-item"):
= {"unit": current_div_summary.find("span",class_=item_unit_css).get_text(),
one_summary_dict "number": current_div_summary.find("span", class_=item_num_css).get_text()}
all_stats.append(one_summary_dict)
# What is in `all_stats` now?
all_stats
- Time to learn about
bind_rows
all_stats <- data.frame()

for (i in seq_along(all_posts)) {
  current_post <- all_posts[[i]]

  all_div_summaries <- current_post %>% html_elements("div.s-post-summary--stats-item")

  # Use a distinct index (`j`) for the inner loop so it does not shadow
  # the outer loop's `i`.
  for (j in seq_along(all_div_summaries)) {
    current_div_summary <- all_div_summaries[[j]]

    one_summary_dict <-
      data.frame(
        unit = current_div_summary %>%
          html_element("span.s-post-summary--stats-item-unit") %>%
          html_text(),
        number = current_div_summary %>%
          html_element("span.s-post-summary--stats-item-number") %>%
          html_text()
      )

    all_stats <- bind_rows(all_stats, one_summary_dict)
  }
}

all_stats
🤔 Q: What is the problem with the dictionary above?
pd.DataFrame(all_stats)
A: Well, I have the statistics about all the 50 posts but I don’t know which one is which!
Let’s fix that:
= []
all_stats for current_post in all_posts:
= current_post.find("h3", class_="s-post-summary--content-title").get_text().strip()
post_title = current_post.find("time").span["title"]
post_time
for current_div_summary in current_post.find_all("div", class_="s-post-summary--stats-item"):
= {"post_title" : post_title,
one_summary_dict "post_time" : post_time,
"unit" : current_div_summary.find("span",class_=item_unit_css).get_text(),
"number" : current_div_summary.find("span", class_=item_num_css).get_text()}
all_stats.append(one_summary_dict)
# What is in `all_stats` now?
all_stats
pd.DataFrame(all_stats)
- Time to learn about str_trim (also a tidyverse function)
all_stats <- data.frame()

for (i in seq_along(all_posts)) {
  current_post <- all_posts[[i]]

  post_title <- current_post %>%
    html_element("h3.s-post-summary--content-title") %>%
    html_text() %>%
    str_trim()
  # Notice anything different in the html_element here?
  # Raise your hand and call your lecturer if you don't see any difference.

  post_time <- current_post %>%
    html_element("time span") %>%
    html_attr("title")

  all_div_summaries <- current_post %>% html_elements("div.s-post-summary--stats-item")

  # Distinct inner index `j` so the outer `i` is not shadowed.
  for (j in seq_along(all_div_summaries)) {
    current_div_summary <- all_div_summaries[[j]]

    one_summary_dict <-
      data.frame(
        unit = current_div_summary %>%
          html_element("span.s-post-summary--stats-item-unit") %>%
          html_text(),
        number = current_div_summary %>%
          html_element("span.s-post-summary--stats-item-number") %>%
          html_text(),
        post_title = post_title,
        post_time = post_time
      )

    all_stats <- bind_rows(all_stats, one_summary_dict)
  }
}

all_stats
(Detour) Plotting distribution of these statistics we collected
= pd.DataFrame(all_stats)
plot_df "number"] = plot_df["number"].astype(int)
plot_df[
= (
g ="unit", y="number", group="unit", fill="unit"))
ggplot(plot_df, aes(x+ geom_boxplot()
)
g
plot_df <- all_stats
# `number` was scraped as text; convert before plotting.
plot_df$number <- as.integer(plot_df$number)

g <- (
  ggplot(plot_df, aes(x = unit, y = number, group = unit, fill = unit))
  + geom_boxplot()
)
g
What if I want to change the order of the X axis?
For this, you’ll need to convert unit
(what we are mapping to the X-axis) to a Categorical (or factor in R).
= pd.DataFrame(all_stats)
plot_df "number"] = plot_df["number"].astype(int)
plot_df["unit"] = pd.Categorical(plot_df["unit"], categories=["views", "vote", "votes", "answer", "answers"], ordered=True)
plot_df[
= (
g ="unit", y="number", group="unit", fill="unit"))
ggplot(plot_df, aes(x+ geom_boxplot()
)
g
plot_df <- all_stats
plot_df$number <- as.integer(plot_df$number)
# NOTE: the boolean literal in R is TRUE, not Python's True
# (`ordered=True` would throw "object 'True' not found").
plot_df$unit <- factor(plot_df$unit,
                       levels = c("views", "vote", "votes", "answer", "answers"),
                       ordered = TRUE)

g <- (
  ggplot(plot_df, aes(x = unit, y = number, group = unit, fill = unit))
  + geom_boxplot()
)
g
Functions
Don’t underestimate the power of functions!
In fact, you might find it easier to work with them than with for
loops.
A function just to parse the current_div_summary
Check numpy docstring convention for documenting functions.
def parse_div_summary(current_div_summary):
    """Parse one stats-item <div> into a one-row DataFrame.

    Parameters
    ----------
    current_div_summary : bs4.element.Tag
        A ``<div class="s-post-summary--stats-item">`` node.

    Returns
    -------
    pd.DataFrame
        One row with string columns ``unit`` and ``number``.
    """
    # BeautifulSoup's `class_` takes a bare class name, not a CSS
    # selector, so no leading "span." here.
    item_unit_css = "s-post-summary--stats-item-unit"
    item_num_css = "s-post-summary--stats-item-number"

    one_summary_dict = pd.DataFrame(
        {"unit": current_div_summary.find("span", class_=item_unit_css).get_text(),
         "number": current_div_summary.find("span", class_=item_num_css).get_text()},
        index=[0],  # scalar values need an explicit index
    )
    return one_summary_dict
parse_div_summary(current_div_summary)
Check roxygen2 convention for documenting functions
#' Parse one stats-item div into a one-row data frame
#'
#' @param current_div_summary An xml2/rvest node for a
#'   `div.s-post-summary--stats-item` element.
#' @return A one-row data.frame with character columns `unit` and `number`.
parse_div_summary <- function(current_div_summary) {
  one_summary_dict <-
    data.frame(
      unit = current_div_summary %>%
        html_element("span.s-post-summary--stats-item-unit") %>%
        html_text(),
      number = current_div_summary %>%
        html_element("span.s-post-summary--stats-item-number") %>%
        html_text()
    )
  one_summary_dict
}
parse_div_summary(current_div_summary)
A function to parse all summaries from a current_post
Time to learn about pd.concat()
. And to revisit Python’s list comprehension feature.
def parse_all_summaries(current_post):
    """Parse every stats item in one post into a single DataFrame.

    Uses a list comprehension over the stats-item <div>s and
    concatenates the one-row frames returned by `parse_div_summary`.
    """
    all_summaries = pd.concat(
        [parse_div_summary(div_summary)
         for div_summary in
         current_post.find_all("div", class_="s-post-summary--stats-item")]
    )
    return all_summaries
parse_all_summaries(current_post)
Time to learn about lapply
# Gather every stats item inside one post <div>, parse each with
# `parse_div_summary`, and bind the results into one data frame.
parse_all_summaries <- function(current_post) {
  all_div_summaries <- current_post %>%
    html_elements("div.s-post-summary--stats-item")

  all_summaries <- lapply(all_div_summaries, parse_div_summary)
  all_summaries <- bind_rows(all_summaries)  # R's equivalent to Python's pd.concat
  all_summaries
}
parse_all_summaries(current_post)
A function to parse other info about a current_post
Time to learn how to expand a dataframe.
def parse_post_div(current_post):
    """Parse one post <div>: title, timestamp, and its summary stats.

    Returns the stats DataFrame from `parse_all_summaries` expanded
    with `post_title` and `post_time` columns.
    """
    post_title = current_post.find("h3", class_="s-post-summary--content-title").get_text().strip()
    post_time = current_post.find("time").span["title"]

    df_summary = parse_all_summaries(current_post)
    # We can reuse the DataFrame returned from `parse_all_summaries`
    df_summary["post_title"] = post_title
    df_summary["post_time"] = post_time
    return df_summary
parse_post_div(current_post)
# Parse one post <div>: extract title and timestamp, then attach them
# as columns to the stats data frame for that post.
parse_post_div <- function(current_post) {
  post_title <- current_post %>%
    html_element("h3.s-post-summary--content-title") %>%
    html_text() %>%
    str_trim()

  post_time <- current_post %>%
    html_element("time span") %>%
    html_attr("title")

  df_summary <- parse_all_summaries(current_post)

  # We can reuse the data frame returned from `parse_all_summaries`
  df_summary$post_title <- post_title
  df_summary$post_time <- post_time

  df_summary
}
parse_post_div(current_post)
A function to parse ALL posts
Instead of growing a list using for
loops, let’s use list comprehension and use pd.concat()
Time to learn about reset_index()
and inplace=True
.
def parse_all_posts(soup):
    """Parse every post on the page into one DataFrame.

    Selects all post <div>s inside the main `div#questions` container
    and concatenates the per-post frames from `parse_post_div`.
    """
    all_posts = soup.find("div", id="questions").find_all("div", class_="s-post-summary")
    df_results = pd.concat([parse_post_div(current_post) for current_post in all_posts])

    # Optional: If you are also annoyed at the zero indices
    # df_results.reset_index(drop=True, inplace=True)
    return df_results
parse_all_posts(soup)
# Parse every post on the page into one data frame: select all post
# <div>s inside `div#questions` and row-bind the per-post frames.
parse_all_posts <- function(response_html) {
  all_posts <- html_element(response_html, css = "div#questions") %>%
    html_elements(css = "div.s-post-summary")

  # bind_rows() already produces fresh sequential row names, so no
  # index reset is needed (unlike pandas' reset_index in Python).
  df_results <- bind_rows(lapply(all_posts, parse_post_div))
  df_results
}
parse_all_posts(response_html)
Putting it all together
- Use Jupyter Notebook/Google Colab for prototyping, but not for reusing code
- Store your functions in a separate file
This is all the code we created:
import pandas as pd
from bs4 import BeautifulSoup
def parse_div_summary(current_div_summary):
    """Parse one stats-item <div> into a one-row DataFrame.

    Defines the class names locally: as a standalone module this file
    cannot rely on notebook globals (the original referenced
    `item_unit_css`/`item_num_css` that were never defined here,
    which would raise NameError).
    """
    # `class_` takes a bare class name, not a CSS selector.
    item_unit_css = "s-post-summary--stats-item-unit"
    item_num_css = "s-post-summary--stats-item-number"

    one_summary_dict = pd.DataFrame(
        {"unit": current_div_summary.find("span", class_=item_unit_css).get_text(),
         "number": current_div_summary.find("span", class_=item_num_css).get_text()},
        index=[0],  # scalar values need an explicit index
    )
    return one_summary_dict
def parse_all_summaries(current_post):
    """Parse every stats item in one post into a single DataFrame."""
    all_summaries = pd.concat(
        [parse_div_summary(div_summary)
         for div_summary in
         current_post.find_all("div", class_="s-post-summary--stats-item")]
    )
    return all_summaries
def parse_post_div(current_post):
    """Parse one post <div>: title, timestamp, and its summary stats."""
    post_title = current_post.find("h3", class_="s-post-summary--content-title").get_text().strip()
    post_time = current_post.find("time").span["title"]

    df_summary = parse_all_summaries(current_post)
    # We can reuse the DataFrame returned from `parse_all_summaries`
    df_summary["post_title"] = post_title
    df_summary["post_time"] = post_time
    return df_summary
def parse_all_posts(soup):
    """Parse every post on the page into one DataFrame."""
    all_posts = soup.find("div", id="questions").find_all("div", class_="s-post-summary")
    df_results = pd.concat([parse_post_div(current_post) for current_post in all_posts])

    # Optional: If you are also annoyed at the zero indices
    # df_results.reset_index(drop=True, inplace=True)
    return df_results
🎯 ACTION POINT:
- In the same folder as this notebook, create a new folder and call it
stackparser
- Create an empty file at
stackparser/__init__.py
(it has to be named exactly like that) - Move the code above to a
stackparser/questions.py
file - Restart the Python session
🎯 ACTION POINT:
- Clear the environment:
rm(list=ls())
Read more about the rm
function like here
- In the same folder as this notebook, create a new file
stackparser_questions.R
The code below should return the same df
as we have been creating:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import stackparser # NOTE THIS LINE
import stackparser.questions # NOTE THIS LINE
= requests.get("https://stackoverflow.com/questions/tagged/python")
response_html = BeautifulSoup(response_html.text)
soup
= stackparser.questions.parse_all_posts(soup)
df df
source("stackparser_questions.R")
<- read_html("https://stackoverflow.com/questions/tagged/R")
response_html <- parse_all_posts(response_html)
df head(df)
Resulting DataFrame
We don’t need to run and re-run code anymore. All we need to do is invoke the function parse_all_posts
from our stackparser
package:
= stackparser.questions.parse_all_posts(soup)
df df
<- read_html("https://stackoverflow.com/questions/tagged/R")
response_html <- parse_all_posts(response_html)
df dim(df)
%>% str df
What does the above mean?
Pre-processing the data
Convert column time
to a proper datetime type
- Using pandas time series capabilities
- Using pandas
assign
function - Read more about python lambda functions
# Luckily the column `time` is already well formatted:
"post_time"]) pd.to_datetime(df[
We can use assign
to add a column to a dataframe:
=lambda x: pd.to_datetime(x["post_time"])) df.assign(post_time_formatted
If we want to store this new column, we need to re-assign the dataframe df
:
df = df.assign(post_time_formatted=lambda x: pd.to_datetime(x["post_time"]))
df.dtypes
- Using the lubridate package
library(lubridate)
# Luckily the column `post_time` is already well formatted:
%>% mutate(post_time_formatted=ymd_hms(post_time)) df
If we want to store this new column, we need to re-assign the dataframe df
<- df %>% mutate(post_time_formatted=ymd_hms(post_time)) df
%>% str df
Let’s create three columns: votes
, answers
, views
- That is a task for pandas pivot_table
="number", columns="unit", index=["post_title", "post_time_formatted"]) df.pivot_table(values
- That is a task for tidyr pivot_wider (part of tidyverse)
%>% pivot_wider(id_cols=c(post_title, post_time_formatted),
df names_from=unit,
values_from=number)
What can we improve about the DF above?
🤝 WORKING TOGETHER
- Modify the functions we created in the
stackparser/questions.py
file so that:answer
is renamed toanswers
vote
is renamed tovotes
- Recreate the Dataframe (code below) and see the difference!
Sadly, Jupyter will not recognise your changes unless you restart and re-run the Python session. You can do that, or, alternatively, add a chunk of code to the top of your notebook and add the two lines below to “reload” functions.
%load_ext autoreload
%autoreload 2
import requests
import pandas as pd
from bs4 import BeautifulSoup
import stackparser
from stackparser.questions import parse_all_posts
= requests.get("https://stackoverflow.com/questions/tagged/python")
response_html = BeautifulSoup(response_html.text)
soup
# Note that we piped the output of `parse_all_posts()` to the `assign` function.
# That is very common to do with Pandas
= (
df
parse_all_posts(soup)=lambda x: pd.to_datetime(x["post_time"]))
.assign(post_time_formatted="number", columns="unit", index=["post_title", "post_time_formatted"])
.pivot_table(values
)
df
<- read_html("https://stackoverflow.com/questions/tagged/R")
response_html <- parse_all_posts(response_html)
df
# Parse the timestamp, then spread `unit` values (votes/answers/views)
# into their own columns, one row per post.
df_formatted <- df %>%
  mutate(post_time_formatted = ymd_hms(post_time)) %>%
  pivot_wider(id_cols = c(post_title, post_time_formatted),
              names_from = unit,
              values_from = number)

print(paste("Column names: ", paste(names(df_formatted), collapse = ", ")))
df_formatted  # or View(df_formatted)