In [20]:
# Import the required Python libraries (each one must already be installed in the Python environment before it can be imported)

import requests   #library for webscrapping
from bs4 import BeautifulSoup   #library for webscrapping
import pandas as pd   #basic library
import re   #library for regex (word matching)
import datetime   #library to use DateTime method

In [21]:
# URL of the webpage with the ongoing events
url = "https://en.wikipedia.org/wiki/Portal:Current_events"

# Send an HTTP GET request to the URL.
# requests never times out on its own, so without an explicit timeout (in seconds)
# this cell could hang indefinitely if the server stops responding.
response = requests.get(url, timeout=10)
print("response:", response)


response: <Response [200]>


In [22]:
# Parse the fetched page and build one DataFrame row per section of the "Ongoing events" box.
# Only proceed when the request succeeded (HTTP 200).
if response.status_code == 200:
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the div with role="region" and aria-labelledby="Ongoing_events"
    ongoing_events_div = soup.find('div', {'role': 'region', 'aria-labelledby': 'Ongoing_events'})

    if ongoing_events_div:
        headlines = []
        descriptions = []
        timestamps = []

        # Find all the h3 elements within the ongoing_events_div (one per section)
        h3_elements = ongoing_events_div.find_all('h3')

        for h3 in h3_elements:
            # Get the headline text from the span with class "mw-headline"
            headline = h3.find('span', class_='mw-headline')
            headlines.append(headline.text.strip() if headline else "No Headline")

            # The element right after the h3 carries the section's event list.
            # find_next_sibling() returns None when there is no such element, so guard
            # before reading .text instead of crashing with AttributeError.
            description_tag = h3.find_next_sibling()
            description = description_tag.text.strip() if description_tag else ""
            descriptions.append(description or "No Description")

            # The element after the description is stored as "Timestamp".
            # NOTE(review): in the recorded output this column holds the next section's
            # headline (or "edit section"), not a time value; the column is dropped
            # further down in the notebook.
            timestamp_tag = description_tag.find_next_sibling() if description_tag else None
            timestamp = timestamp_tag.text.strip() if timestamp_tag else ""
            timestamps.append(timestamp or "No Timestamp")

        # Create a DataFrame with the columns
        df = pd.DataFrame({"Headline": headlines, "Description": descriptions, "Timestamp": timestamps})

        # Display the DataFrame
        print(df)
    else:
        # Message names the element that was actually searched for above
        print("Couldn't find the 'Ongoing_events' region div on the page.")
else:
    print(f"HTTP request failed with status code {response.status_code}.")


    Headline                                        Description     Timestamp
0  Disasters  2023 Atlantic hurricane season\n2023 Pacific h...     Economics
1  Economics  2020–2023 global chip shortage\n2021–2023 glob...      Politics
2   Politics  Armenian protests\nBelarus–European Union bord...  edit section


In [23]:
# Another way of displaying the DataFrame: leave it as the last expression of the cell
# so the notebook renders it as a table instead of the plain-text print() output
df

Unnamed: 0,Headline,Description,Timestamp
0,Disasters,2023 Atlantic hurricane season\n2023 Pacific h...,Economics
1,Economics,2020–2023 global chip shortage\n2021–2023 glob...,Politics
2,Politics,Armenian protests\nBelarus–European Union bord...,edit section


In [24]:
# At this point there is a single row per headline, and all events of that section
# are packed into one newline-separated 'Description' string.
# Unpack that string so that every event gets a row of its own.

# Split on '\n' (one column per event), stack the columns into one long Series,
# discard the inner index level so only the original row label is left,
# and name the Series 'Description' so it can take the place of the old column
split_descriptions = (
    df['Description']
    .str.split('\n', expand=True)
    .stack()
    .reset_index(level=1, drop=True)
    .rename('Description')
)

# Replace the packed 'Description' column with the unpacked Series;
# the remaining columns are repeated for every event that shares the row label
df = pd.concat([df.drop('Description', axis=1), split_descriptions], axis=1)

df

Unnamed: 0,Headline,Timestamp,Description
0,Disasters,Economics,2023 Atlantic hurricane season
0,Disasters,Economics,2023 Pacific hurricane season
0,Disasters,Economics,2023 Pacific typhoon season
0,Disasters,Economics,Opioid epidemic0(United States)
1,Economics,Politics,2020–2023 global chip shortage
1,Economics,Politics,2021–2023 global energy crisis
1,Economics,Politics,2021–2023 inflation surge
1,Economics,Politics,2022–2023 food crises
1,Economics,Politics,Great Resignation
1,Economics,Politics,Argentine monetary crisis


In [25]:
# Function to extract the year from the description
def extract_year(description):
    """Return the first standalone 4-digit number found in ``description``.

    Parameters
    ----------
    description : str or missing value
        Event description text, e.g. "2020–2023 global chip shortage".

    Returns
    -------
    str or None
        The first 4-digit token as a string (for a range such as "2020–2023"
        this is the first year), or None when no such token exists or when
        ``description`` is not a string (None, NaN, ...).
    """
    # Missing values are not strings; re.search would raise TypeError on them
    if not isinstance(description, str):
        return None

    # \b on both sides keeps longer digit runs (e.g. "12345") from matching
    year_match = re.search(r'\b\d{4}\b', description)
    return year_match.group(0) if year_match else None

# Add the 'Event Start Year' column: the year extract_year finds in each description
df['Event Start Year'] = df['Description'].map(extract_year)

# Strip every standalone 4-digit year out of the description text;
# missing values are passed through untouched
df['Description'] = df['Description'].map(
    lambda text: text if pd.isna(text) else re.sub(r'\b\d{4}\b', '', text)
)

df

Unnamed: 0,Headline,Timestamp,Description,Event Start Year
0,Disasters,Economics,Atlantic hurricane season,2023.0
0,Disasters,Economics,Pacific hurricane season,2023.0
0,Disasters,Economics,Pacific typhoon season,2023.0
0,Disasters,Economics,Opioid epidemic0(United States),
1,Economics,Politics,– global chip shortage,2020.0
1,Economics,Politics,– global energy crisis,2021.0
1,Economics,Politics,– inflation surge,2021.0
1,Economics,Politics,– food crises,2022.0
1,Economics,Politics,Great Resignation,
1,Economics,Politics,Argentine monetary crisis,


In [26]:
# Remove the "Timestamp" column (in the recorded output it only repeats the next
# section's headline, so it carries no time information)
df = df.drop("Timestamp", axis=1)
df

Unnamed: 0,Headline,Description,Event Start Year
0,Disasters,Atlantic hurricane season,2023.0
0,Disasters,Pacific hurricane season,2023.0
0,Disasters,Pacific typhoon season,2023.0
0,Disasters,Opioid epidemic0(United States),
1,Economics,– global chip shortage,2020.0
1,Economics,– global energy crisis,2021.0
1,Economics,– inflation surge,2021.0
1,Economics,– food crises,2022.0
1,Economics,Great Resignation,
1,Economics,Argentine monetary crisis,


In [27]:
# Trim leading/trailing whitespace in every text (object-dtype) column;
# columns of any other dtype are returned unchanged
df = df.apply(lambda column: column if column.dtype != "object" else column.str.strip())
df

Unnamed: 0,Headline,Description,Event Start Year
0,Disasters,Atlantic hurricane season,2023.0
0,Disasters,Pacific hurricane season,2023.0
0,Disasters,Pacific typhoon season,2023.0
0,Disasters,Opioid epidemic0(United States),
1,Economics,– global chip shortage,2020.0
1,Economics,– global energy crisis,2021.0
1,Economics,– inflation surge,2021.0
1,Economics,– food crises,2022.0
1,Economics,Great Resignation,
1,Economics,Argentine monetary crisis,


In [28]:
# Keep only ASCII letters, digits and whitespace in the 'Description' column.
# NOTE(review): the removed characters are deleted rather than replaced by a space,
# so words that were joined by a dash or bracket end up glued together
# (the recorded value counts contain 'Belaruseuropean').
df['Description'] = df['Description'].map(lambda text: re.sub(r'[^A-Za-z0-9\s]+', '', text))
df

Unnamed: 0,Headline,Description,Event Start Year
0,Disasters,Atlantic hurricane season,2023.0
0,Disasters,Pacific hurricane season,2023.0
0,Disasters,Pacific typhoon season,2023.0
0,Disasters,Opioid epidemic0United States,
1,Economics,global chip shortage,2020.0
1,Economics,global energy crisis,2021.0
1,Economics,inflation surge,2021.0
1,Economics,food crises,2022.0
1,Economics,Great Resignation,
1,Economics,Argentine monetary crisis,


In [29]:
# Reset the DataFrame index: the description split left repeated row labels (0, 0, 0, ...),
# so replace them with a fresh 0..n-1 index; drop=True discards the old labels
# instead of keeping them as an extra column
df = df.reset_index(drop=True)
df

Unnamed: 0,Headline,Description,Event Start Year
0,Disasters,Atlantic hurricane season,2023.0
1,Disasters,Pacific hurricane season,2023.0
2,Disasters,Pacific typhoon season,2023.0
3,Disasters,Opioid epidemic0United States,
4,Economics,global chip shortage,2020.0
5,Economics,global energy crisis,2021.0
6,Economics,inflation surge,2021.0
7,Economics,food crises,2022.0
8,Economics,Great Resignation,
9,Economics,Argentine monetary crisis,


In [30]:
# Function to extract the event location from the description
def extract_event_location(description, prefixes=('United', 'Sri', 'South', 'North')):
    """Guess the location an event description refers to.

    Parameters
    ----------
    description : str or missing value
        Cleaned event description, e.g. "Atlantic hurricane season".
    prefixes : iterable of str, optional
        First words of multi-word place names. When one of them occurs, it is
        returned together with the word that follows it ("United" -> "United States").
        Defaults to the four keywords the notebook started with; further ones can be
        passed in, for example 'New' (New Zealand), 'Costa' (Costa Rica) or
        'El' (El Salvador).

    Returns
    -------
    str or None
        The matched "<prefix> <next word>" (or the bare prefix when no word follows);
        otherwise the first word of the description; None when the description
        is empty, whitespace-only or not a string (None, NaN, ...).
    """
    # Missing values are not strings; re.search / .split would fail on them
    if not isinstance(description, str):
        return None

    if prefixes:
        # Escape each keyword so it is matched literally inside the pattern
        alternatives = '|'.join(re.escape(prefix) for prefix in prefixes)
        # First alternative: keyword followed by one more word; second: the keyword alone
        location_match = re.search(
            rf'(?:{alternatives})\s+\w+|\b(?:{alternatives})\b', description
        )
        if location_match:
            return location_match.group(0)

    # If none of the keywords are found, fall back to the first word of the description
    words = description.split()
    return words[0] if words else None

# Build the 'Event Location' column by running extract_event_location on every description
df['Event Location'] = df['Description'].map(extract_event_location)

df


Unnamed: 0,Headline,Description,Event Start Year,Event Location
0,Disasters,Atlantic hurricane season,2023.0,Atlantic
1,Disasters,Pacific hurricane season,2023.0,Pacific
2,Disasters,Pacific typhoon season,2023.0,Pacific
3,Disasters,Opioid epidemic0United States,,United States
4,Economics,global chip shortage,2020.0,global
5,Economics,global energy crisis,2021.0,global
6,Economics,inflation surge,2021.0,inflation
7,Economics,food crises,2022.0,food
8,Economics,Great Resignation,,Great
9,Economics,Argentine monetary crisis,,Argentine


In [31]:
# Lookup table from the raw extracted token (demonym, ocean name, ...) to the place name to report
location_mapping = {
    'Atlantic': 'Atlantic Ocean',
    'Pacific': 'Pacific Ocean',
    'French': 'France',
    'Argentine': 'Argentina',
    'Lebanese': 'Lebanon',
    'Pakistani': 'Pakistan',
    'Peruvian': 'Peru',
    'Ukrainian': 'Ukraine',
    'Venezuelan': 'Venezuela',
    'North Kosovo': 'Kosovo',
    'Nigerien': 'Niger',
    'Libyan': 'Libya',
    'Israeli': 'Israel',
    'Haitian': 'Haiti',
    'Armenian': 'Armenia',
    'Turkish': 'Turkey',
    'South African': 'South Africa',
    'Sri Lankan': 'Sri Lanka',
}


# Function to correct event locations to country names
def correct_event_location(event_location):
    """Translate ``event_location`` through ``location_mapping``.

    Returns the mapped place name when the value is a key of the table;
    any other value is handed back unchanged.
    """
    if event_location in location_mapping:
        return location_mapping[event_location]
    return event_location

# Build the 'Correct Location' column by passing every 'Event Location' value through correct_event_location
df['Correct Location'] = df['Event Location'].map(correct_event_location)
df



Unnamed: 0,Headline,Description,Event Start Year,Event Location,Correct Location
0,Disasters,Atlantic hurricane season,2023.0,Atlantic,Atlantic Ocean
1,Disasters,Pacific hurricane season,2023.0,Pacific,Pacific Ocean
2,Disasters,Pacific typhoon season,2023.0,Pacific,Pacific Ocean
3,Disasters,Opioid epidemic0United States,,United States,United States
4,Economics,global chip shortage,2020.0,global,global
5,Economics,global energy crisis,2021.0,global,global
6,Economics,inflation surge,2021.0,inflation,inflation
7,Economics,food crises,2022.0,food,food
8,Economics,Great Resignation,,Great,Great
9,Economics,Argentine monetary crisis,,Argentine,Argentina


In [32]:
# Convert the 'Correct Location' column to title case (first letter of every word capitalised);
# missing values are left as they are
df['Correct Location'] = df['Correct Location'].map(
    lambda place: place if pd.isna(place) else place.title()
)
df

Unnamed: 0,Headline,Description,Event Start Year,Event Location,Correct Location
0,Disasters,Atlantic hurricane season,2023.0,Atlantic,Atlantic Ocean
1,Disasters,Pacific hurricane season,2023.0,Pacific,Pacific Ocean
2,Disasters,Pacific typhoon season,2023.0,Pacific,Pacific Ocean
3,Disasters,Opioid epidemic0United States,,United States,United States
4,Economics,global chip shortage,2020.0,global,Global
5,Economics,global energy crisis,2021.0,global,Global
6,Economics,inflation surge,2021.0,inflation,Inflation
7,Economics,food crises,2022.0,food,Food
8,Economics,Great Resignation,,Great,Great
9,Economics,Argentine monetary crisis,,Argentine,Argentina


In [33]:
# Count how often each 'Correct Location' value occurs; dropna=False also counts missing values
df['Correct Location'].value_counts(dropna=False)

# Discussion point: which of these values are valid locations, and which are not?

United Kingdom     2
Global             2
Pakistan           2
Pacific Ocean      2
Belaruseuropean    1
Ukraine            1
Peru               1
Kosovo             1
Niger              1
Myanmar            1
Libya              1
Israel             1
Impeachment        1
Haiti              1
France             1
Atlantic Ocean     1
Armenia            1
Turkey             1
Sri Lanka          1
South Africa       1
Lebanon            1
Argentina          1
Great              1
Food               1
Inflation          1
United States      1
Venezuela          1
Name: Correct Location, dtype: int64

In [34]:
# Function to calculate duration and format the result
def calculate_duration(event_year):
    """Format the number of years between ``event_year`` and the current calendar year.

    Parameters
    ----------
    event_year : value accepted by int(), or a missing value
        Start year of the event.

    Returns
    -------
    str or None
        "<n> year" when the difference is 0 or 1, "<n> years" otherwise;
        None when ``event_year`` is missing.
    """
    if pd.isna(event_year):
        return None

    elapsed = datetime.datetime.now().year - int(event_year)
    unit = "year" if elapsed in (0, 1) else "years"
    return f"{elapsed} {unit}"

# Build the 'Duration' column by running calculate_duration on every 'Event Start Year' value
df['Duration'] = df['Event Start Year'].map(calculate_duration)
df

Unnamed: 0,Headline,Description,Event Start Year,Event Location,Correct Location,Duration
0,Disasters,Atlantic hurricane season,2023.0,Atlantic,Atlantic Ocean,0 year
1,Disasters,Pacific hurricane season,2023.0,Pacific,Pacific Ocean,0 year
2,Disasters,Pacific typhoon season,2023.0,Pacific,Pacific Ocean,0 year
3,Disasters,Opioid epidemic0United States,,United States,United States,
4,Economics,global chip shortage,2020.0,global,Global,3 years
5,Economics,global energy crisis,2021.0,global,Global,2 years
6,Economics,inflation surge,2021.0,inflation,Inflation,2 years
7,Economics,food crises,2022.0,food,Food,1 year
8,Economics,Great Resignation,,Great,Great,
9,Economics,Argentine monetary crisis,,Argentine,Argentina,


In [35]:
# EXTRA (just to show that the order of the columns can be changed easily)

# Start from the current column order
columns = list(df.columns)

# Take 'Duration' out of the list, then put it back directly after 'Event Start Year'
columns.pop(columns.index('Duration'))
anchor = columns.index('Event Start Year')
columns.insert(anchor + 1, 'Duration')

# Select the columns in the new order
df = df[columns]
df

# Points to discuss:
# Not all 'Correct Location' values are valid country/city names.
# Values such as 'Global', 'Food', 'Inflation' and 'Belaruseuropean' require a more in-depth look, not just web scraping.

Unnamed: 0,Headline,Description,Event Start Year,Duration,Event Location,Correct Location
0,Disasters,Atlantic hurricane season,2023.0,0 year,Atlantic,Atlantic Ocean
1,Disasters,Pacific hurricane season,2023.0,0 year,Pacific,Pacific Ocean
2,Disasters,Pacific typhoon season,2023.0,0 year,Pacific,Pacific Ocean
3,Disasters,Opioid epidemic0United States,,,United States,United States
4,Economics,global chip shortage,2020.0,3 years,global,Global
5,Economics,global energy crisis,2021.0,2 years,global,Global
6,Economics,inflation surge,2021.0,2 years,inflation,Inflation
7,Economics,food crises,2022.0,1 year,food,Food
8,Economics,Great Resignation,,,Great,Great
9,Economics,Argentine monetary crisis,,,Argentine,Argentina
