import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
Download data here: aiddata.csv
Data import and basic manipulation #
df = pd.read_csv('../static/files/aiddata.csv')
# df.head()
# don't actually need the first two columns:
df = df.iloc[:, 2:]
# rename the columns
df.columns = ['year', 'donor', 'recipient', 'amount', 'purpose_code', 'purpose_name']
# df.head()
# check the shape
df.shape
# close to 10K rows!
(98540, 6)
# check year range
min(df.year), max(df.year)
(1973, 2013)
Task 1 #
The first task is: for each country or region, visualize the difference between its donation and receiving, for each year. To do that, we need a dataframe that has these three columns:
- year
- country/region name
- donation
- receiving
- difference
Then, we can visualize the difference by year for each country or region (hereafter referred as “country”).
There are two obvious challenges:
- Right now, each row lists a donation-receiving pair. But what we want is: for each year, what’s the amount of donation and receiving this country has.
- Not all countries appeared in all years.
Let’s first get donation and receiving data from the source data.
def parse_donation_and_receiving(df, var):
"""
To get donationn or receiving data from df
Input:
- df: df
- var: either 'donor' or 'recipient'
Return:
a dataframe
"""
data = []
for group in df.groupby(['year', var]):
# total amount of donation or receiving in that year for this country
total_amount = group[1].amount.sum()
# the amount is so big, we use the unit of billio (10**9)
total_amount = total_amount / 10**9
data.append((
# year:
group[0][0],
# country:
group[0][1],
# amount:
total_amount
))
var_name = 'donation' if var == 'donor' else 'received'
data_df = pd.DataFrame(
data, columns = ['year', 'country', var_name]
)
return data_df
# We first get the donation data
donation_df = parse_donation_and_receiving(df, 'donor')
donation_df.head()
year | country | donation | |
---|---|---|---|
0 | 1973 | Australia | 0.046286 |
1 | 1973 | Belgium | 0.039251 |
2 | 1973 | Canada | 0.437928 |
3 | 1973 | France | 0.247190 |
4 | 1973 | Germany | 0.562232 |
# Then we get the receiving data
receiving_df = parse_donation_and_receiving(df, 'recipient')
receiving_df.head()
year | country | received | |
---|---|---|---|
0 | 1973 | Brazil | 0.312075 |
1 | 1973 | Chile | 0.088056 |
2 | 1973 | Colombia | 0.549945 |
3 | 1973 | Cyprus | 0.009613 |
4 | 1973 | India | 2.285257 |
First, let’s look at how many countries are there:
all_cntry = list(df.donor) + list(df.recipient)
all_cntry = list(set(all_cntry))
# there are in total 47 unique countries
len(all_cntry)
47
We have 47 unique countries, but the problem is, in some years, not all countries are present, for example, in receiving_df
, only 10 countries were present in the year of 1973.
receiving_df[receiving_df.year == 1973]
year | country | received | |
---|---|---|---|
0 | 1973 | Brazil | 0.312075 |
1 | 1973 | Chile | 0.088056 |
2 | 1973 | Colombia | 0.549945 |
3 | 1973 | Cyprus | 0.009613 |
4 | 1973 | India | 2.285257 |
5 | 1973 | Korea | 1.363707 |
6 | 1973 | Kuwait | 0.000325 |
7 | 1973 | Saudi Arabia | 0.000065 |
8 | 1973 | Thailand | 0.206363 |
9 | 1973 | United Arab Emirates | 0.000065 |
# same issue for donation data
donation_df[donation_df.year == 1973]
year | country | donation | |
---|---|---|---|
0 | 1973 | Australia | 0.046286 |
1 | 1973 | Belgium | 0.039251 |
2 | 1973 | Canada | 0.437928 |
3 | 1973 | France | 0.247190 |
4 | 1973 | Germany | 0.562232 |
5 | 1973 | Italy | 0.166719 |
6 | 1973 | Japan | 0.938966 |
7 | 1973 | Netherlands | 0.162751 |
8 | 1973 | Norway | 0.035875 |
9 | 1973 | Sweden | 0.168369 |
10 | 1973 | Switzerland | 0.014061 |
11 | 1973 | United Kingdom | 0.442579 |
12 | 1973 | United States | 1.553264 |
You might ask: what will be the issue if not all countries are in each year. Good question! The issue is that if we do not have data for all countries each year, we are not able to get the difference between donation and receiving for each country in each year very easily. To do that, we need to “normalize” the time. If a country does not appear in a year, that means it does not recieve (or donate) any money in that year.
This is how I solved the problem:
def normalize_time(data, all_cntry, var_name):
"""normlize time for donation_df and receiving_df
Input:
- data: either donation_df or receiving_df
- all_cntry
- var_name: either 'donation' or 'received'
Output:
- a dataframe
"""
dfs = []
for group in data.groupby('year'):
year = group[0]
present_cntry = group[1].country.tolist()
absent_cntry = [x for x in all_cntry if x not in present_cntry]
absent_df = pd.DataFrame({
'year': year,
'country': absent_cntry,
var_name: 0
})
dff = pd.concat([group[1], absent_df], ignore_index = True)
# by sorting the country, we make sure that rows in the output
# donation and receiving dataframes correspond to each other
dff.sort_values(by='country', ascending=True, inplace=True)
dfs.append(dff)
data_df = pd.concat(dfs, ignore_index = True)
return data_df
donation = normalize_time(donation_df, all_cntry, var_name = 'donation')
donation.head()
year | country | donation | |
---|---|---|---|
0 | 1973 | Australia | 0.046286 |
1 | 1973 | Austria | 0.000000 |
2 | 1973 | Belgium | 0.039251 |
3 | 1973 | Brazil | 0.000000 |
4 | 1973 | Canada | 0.437928 |
receiving = normalize_time(receiving_df, all_cntry, var_name = 'received')
receiving.head()
year | country | received | |
---|---|---|---|
0 | 1973 | Australia | 0.000000 |
1 | 1973 | Austria | 0.000000 |
2 | 1973 | Belgium | 0.000000 |
3 | 1973 | Brazil | 0.312075 |
4 | 1973 | Canada | 0.000000 |
# to check whether the countries in the two lists are the same
r_c = list(set(receiving.country))
d_c = list(set(donation.country))
r_c == d_c
True
Then we combine the two datasets and calculate the difference:
all_df = donation
all_df['received'] = receiving['received']
all_df['d_minus_r'] = all_df['donation'] - all_df['received']
all_df.head()
year | country | donation | received | d_minus_r | |
---|---|---|---|---|---|
0 | 1973 | Australia | 0.046286 | 0.000000 | 0.046286 |
1 | 1973 | Austria | 0.000000 | 0.000000 | 0.000000 |
2 | 1973 | Belgium | 0.039251 | 0.000000 | 0.039251 |
3 | 1973 | Brazil | 0.000000 | 0.312075 | -0.312075 |
4 | 1973 | Canada | 0.437928 | 0.000000 | 0.437928 |
all_df.shape
(1927, 5)
To make the plot, we need to transform the format of year:
all_df['year'] = pd.to_datetime(all_df['year'], format='%Y')
alt.Chart(all_df).mark_line().encode(
x = alt.X(
'year',
title = 'Year'
),
y = alt.Y(
'd_minus_r:Q',
title = 'Donation - Received',
axis=alt.Axis(
titleColor='#5276A7',
labelExpr = 'datum.value + "B `$"'
),
),
facet = alt.Facet(
"country:N",
columns = 8,
title = 'Country/Region',
header = alt.Header(labelFontSize = 12)
)
).properties(
width = 120,
height = 110,
).configure_header(
titleColor='#5276A7',
titleFontSize=16,
# labelColor='red',
labelFontSize=14
)
The problem with this plot is that with it, we are not able to see the actual donation and receiving. We can achieve this in the following way.
all_df_melted = pd.melt(all_df,
id_vars=['year', 'country'],
value_vars=['donation', 'received'],
var_name = 'category',
value_name = 'amount'
)
all_df_melted.replace({
'donation': 'Donation',
'received': 'Received'
}, inplace = True)
all_df_melted.rename(
columns = {
'category': 'Category'
}, inplace = True)
all_df_melted.head()
year | country | Category | amount | |
---|---|---|---|---|
0 | 1973-01-01 | Australia | Donation | 0.046286 |
1 | 1973-01-01 | Austria | Donation | 0.000000 |
2 | 1973-01-01 | Belgium | Donation | 0.039251 |
3 | 1973-01-01 | Brazil | Donation | 0.000000 |
4 | 1973-01-01 | Canada | Donation | 0.437928 |
This is important: because we are going to plot both donaiton and received amount each year for each country, it is important to replace 0 with NaN. Because the figures will be very small, if we do not change 0 to NaN, then we will consider 0 as small amount of donation or received amount, which is misleading.
all_df_melted['amount'] = all_df_melted['amount'].replace(0, np.nan)
all_df_melted.head()
year | country | Category | amount | |
---|---|---|---|---|
0 | 1973-01-01 | Australia | Donation | 0.046286 |
1 | 1973-01-01 | Austria | Donation | NaN |
2 | 1973-01-01 | Belgium | Donation | 0.039251 |
3 | 1973-01-01 | Brazil | Donation | NaN |
4 | 1973-01-01 | Canada | Donation | 0.437928 |
alt.Chart(all_df_melted).mark_line().encode(
x = alt.X(
'year',
title = 'Year'
),
y = alt.Y(
'amount:Q',
title = 'Amount',
axis=alt.Axis(
titleColor='#5276A7',
labelExpr = 'datum.value + "B $`"'
),
),
color = alt.Color(
'Category:N',
),
facet = alt.Facet(
"country:N",
columns = 8,
title = 'Country/Region',
header = alt.Header(labelFontSize = 12)
)
).properties(
width = 120,
height = 110,
).configure_header(
titleColor='#5276A7',
titleFontSize=16,
# labelColor='red',
labelFontSize=14
)
Task 2 #
The second task is: find the top 10 coalesced purposes (according to the amount of donation) and for these ten purposes, show how their amount changes by time.
# First, find the top ten purposes
top_purpose_df = df.groupby('purpose_name')['amount'].agg('sum').nlargest(n=10)
top_10_purposes = list(top_purpose_df.index)
top_10_purposes
['Air transport',
'Rail transport',
'Industrial development',
'Power generation/non-renewable sources',
'RESCHEDULING AND REFINANCING',
'Import support (capital goods)',
'Social/ welfare services',
'Telecommunications',
'Power generation/renewable sources',
'Sectors not specified']
# Then select only data involving these top purposes
# We only want three columns: year, amount, and purpose name
purpose_df = df[df.purpose_name.isin(top_10_purposes)][['year', 'amount', 'purpose_name']]
# So many rows:
purpose_df.shape
(17793, 3)
purpose_df.head()
year | amount | purpose_name | |
---|---|---|---|
0 | 1977 | 348718518.0 | Power generation/renewable sources |
2 | 1983 | 79371799.0 | Rail transport |
3 | 1984 | 212202942.0 | Rail transport |
11 | 1976 | 193588873.0 | Power generation/renewable sources |
19 | 1986 | 53561256.0 | Power generation/renewable sources |
# For each purpose, we calculate the yearly sum:
purpose_dfs = []
for group in purpose_df.groupby('purpose_name'):
purpose_name = group[0]
group_df = group[1].groupby('year')['amount'].sum().to_frame()
group_df.reset_index(inplace=True)
group_df['purpose_name'] = group[0]
purpose_dfs.append(group_df)
purpose = pd.concat(purpose_dfs, ignore_index = True)
purpose.head()
year | amount | purpose_name | |
---|---|---|---|
0 | 1974 | 1.525672e+07 | Air transport |
1 | 1975 | 1.582943e+07 | Air transport |
2 | 1977 | 7.046280e+05 | Air transport |
3 | 1979 | 7.342023e+07 | Air transport |
4 | 1980 | 3.088275e+09 | Air transport |
purpose.shape
(338, 3)
# change amount to the unit of billion
purpose['amount'] = purpose['amount'] / 10**9
# # change the format to be year
purpose['year'] = pd.to_datetime(purpose['year'], format='%Y')
max(purpose.amount)
8.344866729
# to change the purpose name
purpose_renamer_dic = {'Air transport': 'Air Transport',
'Rail transport': 'Rail Transport',
'Industrial development': 'Industrial Development',
'Power generation/non-renewable sources': 'Power: Non-renewable',
'RESCHEDULING AND REFINANCING': 'Rescheduling & Refinancing',
'Import support (capital goods)': 'Import Support',
'Social/ welfare services': 'Welfare Services',
'Telecommunications': 'Telecommunications',
'Power generation/renewable sources': 'Power: Renewable',
'Sectors not specified': 'Not Specified'
}
purpose.replace(purpose_renamer_dic, inplace = True)
# make the plot
alt.Chart(purpose).mark_line().encode(
x = 'year',
y = 'amount',
color = 'purpose_name',
strokeDash = 'purpose_name'
)
The problem with this chart is that it is so messy. We can visualize it in another way:
alt.Chart(purpose).mark_rect().encode(
x = alt.X(
'year(year)',
title = 'Year'
),
y = alt.Y(
'purpose_name:N',
title = 'Coalesced Purposes of Donation',
sort=alt.EncodingSortField(field='amount', op='sum', order='descending')
),
color = alt.Color(
'amount:Q',
title = 'Amount (in Billion $`)',
)
).properties(
width = 720,
height = 300,
title = 'Top Ten Coalesced Purposes by Total Donation Amount (1973-2013)',
).configure_axis(
labelFontSize=11,
titleFontSize=12
)
Last modified on 2022-10-16