Basic Data Wrangling and Plotting Tutorial

Hongtao Hao / 2022-10-05

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt

Download data here: aiddata.csv

Data import and basic manipulation #

df = pd.read_csv('../static/files/aiddata.csv')
# df.head()

# don't actually need the first two columns:
df = df.iloc[:, 2:]

# rename the columns
df.columns = ['year', 'donor', 'recipient', 'amount', 'purpose_code', 'purpose_name']
# df.head()

# check the shape
df.shape

# close to 10K rows!
(98540, 6)
# check year range

min(df.year), max(df.year)
(1973, 2013)

Task 1 #

The first task is: for each country or region, visualize the difference between its donation and receiving, for each year. To do that, we need a dataframe that has these three columns:

Then, we can visualize the difference by year for each country or region (hereafter referred as “country”).

There are two obvious challenges:

Let’s first get donation and receiving data from the source data.

def parse_donation_and_receiving(df, var):
    """
    To get donationn or receiving data from df
    
    Input:
        - df: df
        - var: either 'donor' or 'recipient'
    
    Return:
        a dataframe
    """
    data = []
    for group in df.groupby(['year', var]):
        # total amount of donation or receiving in that year for this country
        total_amount = group[1].amount.sum()
        # the amount is so big, we use the unit of billio (10**9)
        total_amount = total_amount / 10**9
        data.append((
            # year:
            group[0][0],
            # country:
            group[0][1],
            # amount:
            total_amount
        ))
    var_name = 'donation' if var == 'donor' else 'received'
    data_df = pd.DataFrame(
        data, columns = ['year', 'country', var_name]
    )
    return data_df
# We first get the donation data
donation_df = parse_donation_and_receiving(df, 'donor')
donation_df.head()

yearcountrydonation
01973Australia0.046286
11973Belgium0.039251
21973Canada0.437928
31973France0.247190
41973Germany0.562232
# Then we get the receiving data
receiving_df = parse_donation_and_receiving(df, 'recipient')
receiving_df.head()

yearcountryreceived
01973Brazil0.312075
11973Chile0.088056
21973Colombia0.549945
31973Cyprus0.009613
41973India2.285257

First, let’s look at how many countries are there:

all_cntry = list(df.donor) + list(df.recipient)
all_cntry = list(set(all_cntry))

# there are in total 47 unique countries
len(all_cntry)
47

We have 47 unique countries, but the problem is, in some years, not all countries are present, for example, in receiving_df, only 10 countries were present in the year of 1973.

receiving_df[receiving_df.year == 1973]

yearcountryreceived
01973Brazil0.312075
11973Chile0.088056
21973Colombia0.549945
31973Cyprus0.009613
41973India2.285257
51973Korea1.363707
61973Kuwait0.000325
71973Saudi Arabia0.000065
81973Thailand0.206363
91973United Arab Emirates0.000065
# same issue for donation data
donation_df[donation_df.year == 1973]

yearcountrydonation
01973Australia0.046286
11973Belgium0.039251
21973Canada0.437928
31973France0.247190
41973Germany0.562232
51973Italy0.166719
61973Japan0.938966
71973Netherlands0.162751
81973Norway0.035875
91973Sweden0.168369
101973Switzerland0.014061
111973United Kingdom0.442579
121973United States1.553264

You might ask: what will be the issue if not all countries are in each year. Good question! The issue is that if we do not have data for all countries each year, we are not able to get the difference between donation and receiving for each country in each year very easily. To do that, we need to “normalize” the time. If a country does not appear in a year, that means it does not recieve (or donate) any money in that year.

This is how I solved the problem:

def normalize_time(data, all_cntry, var_name):
    """normlize time for donation_df and receiving_df
    
    Input: 
        - data: either donation_df or receiving_df
        - all_cntry
        - var_name: either 'donation' or 'received'
    
    Output:
        - a dataframe
    """
    dfs = []
    for group in data.groupby('year'):
        year = group[0]
        present_cntry = group[1].country.tolist()
        absent_cntry = [x for x in all_cntry if x not in present_cntry]
        absent_df = pd.DataFrame({
            'year': year,
            'country': absent_cntry,
            var_name: 0
        })
        dff = pd.concat([group[1], absent_df], ignore_index = True)
        
        # by sorting the country, we make sure that rows in the output
        # donation and receiving dataframes correspond to each other
        dff.sort_values(by='country', ascending=True, inplace=True)
        dfs.append(dff)
    
    data_df = pd.concat(dfs, ignore_index = True)
    return data_df
donation = normalize_time(donation_df, all_cntry, var_name = 'donation')
donation.head()

yearcountrydonation
01973Australia0.046286
11973Austria0.000000
21973Belgium0.039251
31973Brazil0.000000
41973Canada0.437928
receiving = normalize_time(receiving_df, all_cntry, var_name = 'received')
receiving.head()

yearcountryreceived
01973Australia0.000000
11973Austria0.000000
21973Belgium0.000000
31973Brazil0.312075
41973Canada0.000000
# to check whether the countries in the two lists are the same
r_c = list(set(receiving.country))
d_c = list(set(donation.country))
r_c == d_c
True

Then we combine the two datasets and calculate the difference:

all_df = donation
all_df['received'] = receiving['received']
all_df['d_minus_r'] = all_df['donation'] - all_df['received']
all_df.head()

yearcountrydonationreceivedd_minus_r
01973Australia0.0462860.0000000.046286
11973Austria0.0000000.0000000.000000
21973Belgium0.0392510.0000000.039251
31973Brazil0.0000000.312075-0.312075
41973Canada0.4379280.0000000.437928
all_df.shape
(1927, 5)

To make the plot, we need to transform the format of year:

all_df['year'] = pd.to_datetime(all_df['year'], format='%Y')
alt.Chart(all_df).mark_line().encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y = alt.Y(
        'd_minus_r:Q',
        title = 'Donation - Received',
        axis=alt.Axis(
            titleColor='#5276A7',
            labelExpr = 'datum.value + "B `$"'
        ),
    ),
    facet = alt.Facet(
        "country:N",
        columns = 8,
        title = 'Country/Region',
        header = alt.Header(labelFontSize = 12)
    )
).properties(
    width = 120,
    height = 110,
).configure_header(
    titleColor='#5276A7',
    titleFontSize=16,
#     labelColor='red',
    labelFontSize=14
)

The problem with this plot is that with it, we are not able to see the actual donation and receiving. We can achieve this in the following way.

all_df_melted = pd.melt(all_df, 
        id_vars=['year', 'country'], 
        value_vars=['donation', 'received'],
        var_name = 'category',
        value_name = 'amount'
       )
all_df_melted.replace({
    'donation': 'Donation',
    'received': 'Received'
}, inplace = True)
all_df_melted.rename(
    columns = {
        'category': 'Category'
    }, inplace = True)
all_df_melted.head()

yearcountryCategoryamount
01973-01-01AustraliaDonation0.046286
11973-01-01AustriaDonation0.000000
21973-01-01BelgiumDonation0.039251
31973-01-01BrazilDonation0.000000
41973-01-01CanadaDonation0.437928

This is important: because we are going to plot both donaiton and received amount each year for each country, it is important to replace 0 with NaN. Because the figures will be very small, if we do not change 0 to NaN, then we will consider 0 as small amount of donation or received amount, which is misleading.

all_df_melted['amount'] = all_df_melted['amount'].replace(0, np.nan)
all_df_melted.head()

yearcountryCategoryamount
01973-01-01AustraliaDonation0.046286
11973-01-01AustriaDonationNaN
21973-01-01BelgiumDonation0.039251
31973-01-01BrazilDonationNaN
41973-01-01CanadaDonation0.437928
alt.Chart(all_df_melted).mark_line().encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y = alt.Y(
        'amount:Q',
        title = 'Amount',
        axis=alt.Axis(
            titleColor='#5276A7',
            labelExpr = 'datum.value + "B $`"'
        ),
    ),
    color = alt.Color(
        'Category:N',
    ),
    facet = alt.Facet(
        "country:N",
        columns = 8,
        title = 'Country/Region',
        header = alt.Header(labelFontSize = 12)
    )
).properties(
    width = 120,
    height = 110,
).configure_header(
    titleColor='#5276A7',
    titleFontSize=16,
#     labelColor='red',
    labelFontSize=14
)

Task 2 #

The second task is: find the top 10 coalesced purposes (according to the amount of donation) and for these ten purposes, show how their amount changes by time.

# First, find the top ten purposes
top_purpose_df = df.groupby('purpose_name')['amount'].agg('sum').nlargest(n=10)
top_10_purposes = list(top_purpose_df.index)
top_10_purposes
['Air transport',
 'Rail transport',
 'Industrial development',
 'Power generation/non-renewable sources',
 'RESCHEDULING AND REFINANCING',
 'Import support (capital goods)',
 'Social/ welfare services',
 'Telecommunications',
 'Power generation/renewable sources',
 'Sectors not specified']
# Then select only data involving these top purposes
# We only want three columns: year, amount, and purpose name
purpose_df = df[df.purpose_name.isin(top_10_purposes)][['year', 'amount', 'purpose_name']]
# So many rows:
purpose_df.shape
(17793, 3)
purpose_df.head()

yearamountpurpose_name
01977348718518.0Power generation/renewable sources
2198379371799.0Rail transport
31984212202942.0Rail transport
111976193588873.0Power generation/renewable sources
19198653561256.0Power generation/renewable sources
# For each purpose, we calculate the yearly sum:
purpose_dfs = []
for group in purpose_df.groupby('purpose_name'):
    purpose_name = group[0]
    group_df = group[1].groupby('year')['amount'].sum().to_frame()
    group_df.reset_index(inplace=True)
    group_df['purpose_name'] = group[0]
    purpose_dfs.append(group_df)
purpose = pd.concat(purpose_dfs, ignore_index = True)
purpose.head()

yearamountpurpose_name
019741.525672e+07Air transport
119751.582943e+07Air transport
219777.046280e+05Air transport
319797.342023e+07Air transport
419803.088275e+09Air transport
purpose.shape
(338, 3)
# change amount to the unit of billion
purpose['amount'] = purpose['amount'] / 10**9

# # change the format to be year
purpose['year'] = pd.to_datetime(purpose['year'], format='%Y')
max(purpose.amount)
8.344866729
# to change the purpose name
purpose_renamer_dic = {'Air transport': 'Air Transport',
 'Rail transport': 'Rail Transport',
 'Industrial development': 'Industrial Development',
 'Power generation/non-renewable sources': 'Power: Non-renewable',
 'RESCHEDULING AND REFINANCING': 'Rescheduling & Refinancing',
 'Import support (capital goods)': 'Import Support',
 'Social/ welfare services': 'Welfare Services',
 'Telecommunications': 'Telecommunications',
 'Power generation/renewable sources': 'Power: Renewable',
 'Sectors not specified': 'Not Specified'
 }
purpose.replace(purpose_renamer_dic, inplace = True)
# make the plot
alt.Chart(purpose).mark_line().encode(
    x = 'year',
    y = 'amount',
    color = 'purpose_name',
    strokeDash = 'purpose_name'
)

The problem with this chart is that it is so messy. We can visualize it in another way:

alt.Chart(purpose).mark_rect().encode(
    x = alt.X(
        'year(year)',
        title = 'Year'
    ),
    y = alt.Y(
        'purpose_name:N',
        title = 'Coalesced Purposes of Donation',
        sort=alt.EncodingSortField(field='amount', op='sum', order='descending')
    ),
    color = alt.Color(
        'amount:Q',
        title = 'Amount (in Billion $`)',
    )
).properties(
    width = 720,
    height = 300,
    title = 'Top Ten Coalesced Purposes by Total Donation Amount (1973-2013)',
).configure_axis(
    labelFontSize=11,
    titleFontSize=12
)
#DS

Last modified on 2022-10-16