Country Data Analysis icon Country Data Analysis
University Project #Data Science#Mathematics

Country Data Analysis#

Performing preliminary data cleaning along with a mathematical analysis on country data.

.rmd knitted with execution results available for download here

# grab the data
full_data = read.csv(url("https://raw.githubusercontent.com/bnokoro/Data-Science/master/countries%20of%20the%20world.csv"))
# all column names
colnames(full_data)
## [1] "Country" "Region"
## [3] "Population" "Area..sq..mi.."
## [5] "Pop..Density..per.sq..mi.." "Coastline..coast.area.ratio."
## [7] "Net.migration" "Infant.mortality..per.1000.births."
## [9] "GDP....per.capita." "Literacy...."
## [11] "Phones..per.1000." "Arable...."
## [13] "Crops...." "Other...."
## [15] "Climate" "Birthrate"
## [17] "Deathrate" "Agriculture"
## [19] "Industry" "Service"
# make column names more usable
c_names = c("country", "region", "pop", "area", "pop_density", "coastline_ratio", "migration", "infant_mortality", "gdp", "literacy", "phones", "arable", "crops", "other", "climate", "birthrate", "deathrate", "agri", "industry", "service")
colnames(full_data) = c_names
# many columns are just decimals as strings with a ',' instead of a '.'
# if we don't fix this then there are only a few usable columns for PC calculation
data = as.data.frame(lapply(full_data, function(y) as.numeric(gsub(",", ".", y))))
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
# replace NA columns for country and region
data$country = full_data$country
data$region = full_data$region
# there are some rows with NA values still, which we don't want, so remove them
data = data[complete.cases(data),]
# view the head
head(data)
## country region pop area
## 1 Afghanistan ASIA (EX. NEAR EAST) 31056997 647500
## 2 Albania EASTERN EUROPE 3581655 28748
## 3 Algeria NORTHERN AFRICA 32930091 2381740
## 7 Anguilla LATIN AMER. & CARIB 13477 102
## 8 Antigua & Barbuda LATIN AMER. & CARIB 69108 443
## 9 Argentina LATIN AMER. & CARIB 39921833 2766890
## pop_density coastline_ratio migration infant_mortality gdp literacy phones
## 1 48.0 0.00 23.06 163.07 700 36.0 3.2
## 2 124.6 1.26 -4.93 21.52 4500 86.5 71.2
## 3 13.8 0.04 -0.39 31.00 6000 70.0 78.1
## 7 132.1 59.80 10.76 21.03 8600 95.0 460.0
## 8 156.0 34.54 -6.15 19.46 11000 89.0 549.9
## 9 14.4 0.18 0.61 15.18 11200 97.1 220.4
## arable crops other climate birthrate deathrate agri industry service
## 1 12.13 0.22 87.65 1 46.60 20.34 0.380 0.240 0.380
## 2 21.09 4.42 74.49 3 15.11 5.22 0.232 0.188 0.579
## 3 3.22 0.25 96.53 1 17.14 4.61 0.101 0.600 0.298
## 7 0.00 0.00 100.00 2 14.17 5.34 0.040 0.180 0.780
## 8 18.18 4.55 77.27 2 16.93 5.37 0.038 0.220 0.743
## 9 12.31 0.48 87.21 3 16.73 7.55 0.095 0.358 0.547
# number of countries (incidentally, this is also the number of rows)
length(unique(data$country))
## [1] 179

Part 1: By ‘Hand’ Computation#

Firstly we need to center the data.

This is done for the quantitative columns by taking the mean value of the entire column and subtracting that from the entire column.

# firstly, center the data
# we want to center all the quantitative data, so all except the 'country' and 'region' columns
to_center = c("pop", "area", "pop_density", "coastline_ratio", "migration", "infant_mortality", "gdp", "literacy", "phones", "arable", "crops", "other", "climate", "birthrate", "deathrate", "agri", "industry", "service")
# loop through list of columns to center
for (c in to_center) {
# create new column name
c_name = sprintf("%s.c", c)
# calculate column mean
c_mean = mean(data[[c]])
# create new column of name 'c_name', that is the original column centered on '0'
data[[c_name]] = data[[c]] - c_mean
}
head(data)
## country region pop area
## 1 Afghanistan ASIA (EX. NEAR EAST) 31056997 647500
## 2 Albania EASTERN EUROPE 3581655 28748
## 3 Algeria NORTHERN AFRICA 32930091 2381740
## 7 Anguilla LATIN AMER. & CARIB 13477 102
## 8 Antigua & Barbuda LATIN AMER. & CARIB 69108 443
## 9 Argentina LATIN AMER. & CARIB 39921833 2766890
## pop_density coastline_ratio migration infant_mortality gdp literacy phones
## 1 48.0 0.00 23.06 163.07 700 36.0 3.2
## 2 124.6 1.26 -4.93 21.52 4500 86.5 71.2
## 3 13.8 0.04 -0.39 31.00 6000 70.0 78.1
## 7 132.1 59.80 10.76 21.03 8600 95.0 460.0
## 8 156.0 34.54 -6.15 19.46 11000 89.0 549.9
## 9 14.4 0.18 0.61 15.18 11200 97.1 220.4
## arable crops other climate birthrate deathrate agri industry service
## 1 12.13 0.22 87.65 1 46.60 20.34 0.380 0.240 0.380
## 2 21.09 4.42 74.49 3 15.11 5.22 0.232 0.188 0.579
## 3 3.22 0.25 96.53 1 17.14 4.61 0.101 0.600 0.298
## 7 0.00 0.00 100.00 2 14.17 5.34 0.040 0.180 0.780
## 8 18.18 4.55 77.27 2 16.93 5.37 0.038 0.220 0.743
## 9 12.31 0.48 87.21 3 16.73 7.55 0.095 0.358 0.547
## pop.c area.c pop_density.c coastline_ratio.c migration.c
## 1 -3157153 83317 -246.805 -16.4952 23.2665363
## 2 -30632495 -535435 -170.205 -15.2352 -4.7234637
## 3 -1284059 1817557 -281.005 -16.4552 -0.1834637
## 7 -34200673 -564081 -162.705 43.3048 10.9665363
## 8 -34145042 -563740 -138.805 18.0448 -5.9434637
## 9 5707683 2202707 -280.405 -16.3152 0.8165363
## infant_mortality.c gdp.c literacy.c phones.c arable.c crops.c
## 1 124.171844 -8425.6983 -45.944134 -204.95196 -1.870447 -4.22083799
## 2 -17.378156 -4625.6983 4.555866 -136.95196 7.089553 -0.02083799
## 3 -7.898156 -3125.6983 -11.944134 -130.05196 -10.780447 -4.19083799
## 7 -17.868156 -525.6983 13.055866 251.84804 -14.000447 -4.44083799
## 8 -19.438156 1874.3017 7.055866 341.74804 4.179553 0.10916201
## 9 -23.718156 2074.3017 15.155866 12.24804 -1.690447 -3.96083799
## other.c climate.c birthrate.c deathrate.c agri.c industry.c
## 1 6.091788 -1.1089385 23.532514 10.87486 0.22309497 -0.04802793
## 2 -7.068212 0.8910615 -7.957486 -4.24514 0.07509497 -0.10002793
## 3 14.971788 -1.1089385 -5.927486 -4.85514 -0.05590503 0.31197207
## 7 18.441788 -0.1089385 -8.897486 -4.12514 -0.11690503 -0.10802793
## 8 -4.288212 -0.1089385 -6.137486 -4.09514 -0.11890503 -0.06802793
## 9 5.651788 0.8910615 -6.337486 -1.91514 -0.06190503 0.06997207
## service.c
## 1 -0.17450838
## 2 0.02449162
## 3 -0.25650838
## 7 0.22549162
## 8 0.18849162
## 9 -0.00750838

Once all the columns are centered, we need to compute the covariance matrix.

In the notes, the covariance between any two random variables is a linear combination of the two variable columns, times the reciprocal of their dimension (which is the same between both samples).

Since we want to compute the covariance matrix, which is all possible covariances between all variables, we can get this by simply multiplying the transpose of our centered data matrix by itself.

# now compute the covariance matrix
# grab all the centered data
c_data = as.matrix(data[, to_center])
# covariance matrix is (1/dim) t(X) * X
mat_cov = (1/dim(c_data)[1]) * t(c_data) %*% c_data
# mat_cov is an 18 x 18 matrix holding all possible covariances between all variables
as.data.frame(head(mat_cov))
## pop area pop_density coastline_ratio
## pop 1.843534e+16 1.310091e+14 6.650672e+09 34955497.1742
## area 1.310091e+14 2.255279e+12 3.421393e+07 254660.3702
## pop_density 6.650672e+09 3.421393e+07 1.978894e+06 21507.7640
## coastline_ratio 3.495550e+07 2.546604e+05 2.150776e+04 5714.2032
## migration -6.341238e+05 2.311779e+05 9.929426e+02 -87.8610
## infant_mortality 1.342162e+09 2.208914e+07 4.522641e+03 366.0719
## migration infant_mortality gdp literacy
## pop -6.341238e+05 1.342162e+09 2.697464e+11 2.704456e+09
## area 2.311779e+05 2.208914e+07 6.063487e+09 4.625027e+07
## pop_density 9.929426e+02 4.522641e+03 5.205300e+06 2.650005e+04
## coastline_ratio -8.786100e+01 3.660719e+02 1.759397e+05 1.497042e+03
## migration 2.249060e+01 -5.853604e+00 1.537492e+04 -2.196576e+01
## infant_mortality -5.853604e+00 2.755919e+03 1.382933e+05 2.656599e+03
## phones arable crops other
## pop 7.013224e+09 8.351684e+08 8.561517e+07 2.500641e+09
## area 1.401425e+08 6.917724e+06 7.040618e+05 4.879692e+07
## pop_density 1.066343e+05 3.058999e+03 9.032472e+02 2.551825e+04
## coastline_ratio 4.954010e+03 1.556647e+02 3.109345e+02 1.182920e+03
## migration 1.877714e+02 -6.983343e+00 -1.641154e+01 2.741058e+00
## infant_mortality 2.931789e+03 4.877035e+02 1.455181e+02 3.256592e+03
## climate birthrate deathrate agri
## pop 7.046718e+07 6.935192e+08 2.893140e+08 5.221609e+06
## area 1.097993e+06 1.242727e+07 5.164603e+06 8.494503e+04
## pop_density 6.098891e+02 4.097778e+03 1.856882e+03 1.629816e+01
## coastline_ratio 3.339849e+01 3.278061e+02 9.917737e+01 2.228266e+00
## migration -6.676536e-01 -6.636186e+00 -9.012095e-01 -1.014925e-01
## infant_mortality 7.304123e+01 1.239376e+03 4.901135e+02 1.013914e+01
## industry service
## pop 1.155461e+07 1.744558e+07
## area 1.826015e+05 2.967024e+05
## pop_density 5.693481e+01 2.215264e+02
## coastline_ratio 2.800546e+00 1.146237e+01
## migration -6.240659e-02 -4.290771e-02
## infant_mortality 1.078295e+01 1.796849e+01

Now compute eigenvalues of the covariance matrix

eig = eigen(mat_cov)
# order by decreasing eigenvalues
ord = order(eig$values, decreasing = TRUE)
eig$values = eig$values[ord]
eig$vectors = eig$vectors[, ord]
as.data.frame(eig$values)
## eig$values
## 1 1.843627e+16
## 2 1.324220e+12
## 3 1.590790e+08
## 4 1.807700e+06
## 5 1.146166e+04
## 6 7.545205e+03
## 7 4.985179e+03
## 8 7.157794e+02
## 9 2.996424e+02
## 10 6.139323e+01
## 11 5.232119e+01
## 12 1.636866e+01
## 13 1.103887e+01
## 14 1.005573e+01
## 15 1.995379e-01
## 16 1.125209e-01
## 17 1.825861e-02
## 18 3.616292e-05
as.data.frame(eig$vectors)
## V1 V2 V3 V4 V5
## 1 9.999747e-01 7.106755e-03 7.604406e-06 6.802394e-07 -6.233354e-08
## 2 7.106744e-03 -9.999698e-01 -3.130424e-03 -1.124480e-04 -8.987999e-06
## 3 3.607426e-07 9.843912e-06 3.276378e-02 -9.994192e-01 -2.709532e-03
## 4 1.896068e-09 -5.122782e-09 1.106982e-03 -8.730955e-03 2.520540e-01
## 5 -3.430546e-11 -1.780109e-07 9.222278e-05 -2.843458e-04 -1.681466e-02
## 6 7.280676e-08 -9.477645e-06 4.996314e-04 -8.698326e-04 1.633475e-01
## 7 1.463326e-05 -3.131528e-03 9.992240e-01 3.276006e-02 -2.040249e-02
## 8 1.467063e-07 -2.041324e-05 4.549534e-03 -1.159722e-03 3.273766e-01
## 9 3.804481e-07 -6.819751e-05 2.072662e-02 1.676540e-03 8.374077e-01
## 10 4.530182e-08 -7.420190e-07 7.442480e-04 6.143764e-04 6.261708e-02
## 11 4.643999e-09 -7.224704e-08 1.437989e-04 -7.031651e-05 3.603481e-02
## 12 1.356525e-07 -2.342991e-05 3.906615e-03 -2.534051e-03 2.925687e-01
## 13 3.822531e-09 -4.510087e-07 1.180019e-04 1.344362e-05 8.311393e-03
## 14 3.762095e-08 -5.662700e-06 6.642757e-04 -2.581299e-04 9.274100e-02
## 15 1.569425e-08 -2.347492e-06 3.776934e-04 1.014224e-04 3.406494e-02
## 16 2.832504e-10 -3.612348e-08 1.953432e-06 -2.636424e-06 7.650543e-04
## 17 6.267870e-10 -7.588511e-08 1.376745e-05 9.933738e-06 8.129917e-04
## 18 9.463547e-10 -1.304394e-07 3.220141e-05 -2.724061e-05 2.332382e-03
## V6 V7 V8 V9 V10
## 1 -5.822566e-09 1.139797e-08 9.773677e-09 -4.594780e-08 -1.361968e-08
## 2 1.192345e-05 5.353165e-06 -1.466498e-06 3.854116e-06 3.925471e-07
## 3 3.187000e-03 -8.235817e-03 3.583135e-04 1.396374e-03 -4.346139e-04
## 4 -2.976292e-02 9.653702e-01 2.499145e-02 -1.241493e-02 5.035454e-02
## 5 3.320519e-03 -1.261809e-02 4.995595e-02 -3.564852e-02 8.276962e-02
## 6 -4.839443e-01 -6.873429e-02 7.291064e-01 2.192444e-01 -1.382742e-01
## 7 -5.723366e-03 4.322522e-03 2.208840e-04 7.060479e-04 -2.757826e-04
## 8 -4.116161e-01 -7.334592e-02 -6.326789e-01 3.329605e-01 -1.862596e-01
## 9 4.881175e-01 -2.067870e-01 1.268556e-01 -3.188241e-02 -1.074262e-02
## 10 -5.741838e-02 -3.318512e-02 -2.398614e-02 6.755562e-01 5.762565e-01
## 11 -2.703785e-02 3.708767e-02 -7.214306e-02 1.711070e-01 -6.966457e-01
## 12 -5.450605e-01 -1.076902e-01 -1.225318e-01 -5.863444e-01 2.901616e-01
## 13 -1.076271e-02 -2.818218e-03 -7.367475e-03 1.787050e-02 1.206803e-02
## 14 -2.225382e-01 -2.371476e-02 1.467802e-01 5.007817e-02 -1.788125e-01
## 15 -8.000099e-02 -1.746494e-02 8.065051e-02 7.963148e-02 5.460128e-02
## 16 -1.809252e-03 -1.677573e-04 1.846775e-03 1.567684e-03 -8.732642e-04
## 17 -1.981793e-03 -4.678067e-04 -1.909807e-03 1.264201e-04 1.300531e-03
## 18 -2.498008e-03 -4.011452e-04 -2.090376e-03 9.354233e-04 1.269822e-03
## V11 V12 V13 V14 V15
## 1 4.417525e-09 0.000000e+00 3.769662e-09 4.722026e-09 0.000000e+00
## 2 -4.174736e-07 -2.978555e-08 -3.049280e-07 -4.778417e-08 3.542790e-08
## 3 -1.552183e-04 -2.677304e-04 3.630014e-04 2.244688e-04 2.226249e-05
## 4 9.831278e-03 1.072821e-02 -1.290419e-03 6.143173e-03 5.272939e-04
## 5 3.515466e-02 7.690653e-01 -5.934234e-01 2.095751e-01 1.216091e-02
## 6 3.165543e-01 2.561157e-02 9.350098e-03 -1.860822e-01 4.207467e-03
## 7 4.240393e-06 -2.014810e-04 1.997676e-04 -2.000269e-04 -7.823850e-06
## 8 3.474753e-01 1.645285e-01 1.448907e-01 5.771106e-02 -2.806471e-03
## 9 -3.490479e-03 5.479346e-03 2.982210e-03 4.495995e-03 -1.786292e-04
## 10 -3.668537e-01 -1.047262e-01 -1.771975e-01 -1.576114e-01 -2.537776e-02
## 11 -3.838293e-01 -2.246403e-01 -4.947221e-01 -1.842324e-01 -1.610512e-02
## 12 -2.534338e-01 -1.679909e-01 -2.255129e-01 -1.383069e-01 -1.424140e-02
## 13 -7.130573e-03 -3.612973e-02 -2.600230e-02 1.738677e-02 8.451081e-01
## 14 -6.360486e-01 2.920380e-01 4.697031e-01 4.178997e-01 1.252985e-02
## 15 1.640327e-01 -4.518130e-01 -2.799829e-01 8.155786e-01 -3.992294e-02
## 16 -1.912146e-03 8.862908e-03 1.446416e-03 -6.165337e-03 -3.676682e-01
## 17 -6.901997e-04 -1.450381e-02 -7.164754e-03 -3.537006e-04 3.830668e-01
## 18 -7.408451e-03 6.915870e-04 -3.347303e-03 1.506317e-03 -2.900742e-02
## V16 V17 V18
## 1 1.293053e-10 0.000000e+00 0.000000e+00
## 2 2.381118e-08 1.041639e-08 3.080153e-10
## 3 3.254493e-06 -2.093791e-05 -6.453329e-07
## 4 -5.369443e-05 -2.544100e-04 -5.235376e-06
## 5 -2.271815e-03 -2.516954e-03 -6.349697e-07
## 6 -9.051947e-04 3.317874e-03 1.189014e-04
## 7 4.259468e-06 5.630168e-06 1.001056e-07
## 8 -4.078064e-03 -2.268992e-04 1.914937e-05
## 9 -6.468753e-04 -6.118164e-04 -2.073485e-05
## 10 -1.366930e-02 -1.187529e-03 -5.930359e-03
## 11 -4.829122e-03 -2.399308e-04 -5.908173e-03
## 12 -3.463189e-03 -1.345553e-03 -5.914693e-03
## 13 5.315110e-01 -1.547903e-02 1.417060e-03
## 14 -1.108370e-03 -4.838249e-03 3.932646e-06
## 15 -1.049969e-02 -1.565647e-03 -1.868645e-04
## 16 5.697981e-01 -4.847823e-01 5.522712e-01
## 17 -6.228680e-01 -3.712942e-01 5.719821e-01
## 18 6.711889e-02 7.917324e-01 6.064041e-01

Now compute the singular values

From there, the singular values are just the square root of the eigenvalues

eig$sing = sqrt(eig$values)
as.data.frame(eig$sing)
## eig$sing
## 1 1.357802e+08
## 2 1.150748e+06
## 3 1.261265e+04
## 4 1.344507e+03
## 5 1.070591e+02
## 6 8.686314e+01
## 7 7.060580e+01
## 8 2.675405e+01
## 9 1.731018e+01
## 10 7.835383e+00
## 11 7.233339e+00
## 12 4.045820e+00
## 13 3.322479e+00
## 14 3.171077e+00
## 15 4.466967e-01
## 16 3.354413e-01
## 17 1.351244e-01
## 18 6.013561e-03

Construct the basis

Now we need to use our eigenvectors to construct a change of basis based on the highest singular value.

Specifically, we construct a change of basis into a basis starting with the eigenvector associated with the highest singular value.

Though, we can’t just use the eigenvectors as they are, we want our new basis to be orthonormal.

Having a normalized basis just makes things simpler, but there is a good reason for the basis to be orthogonal as well.

By having the basis vectors orthogonal to each other, we are effectively constructing a basis from our eigenvectors only using the parts of the eigenvectors that are uncorrelated from each other.

This leaves us with a basis that will describe our data only in terms of the parts that are uncorrelated from each other.

To do this, we need to preform the Gram-Schmidt process on our matrix of eigenvectors.

if (!require("pracma")) {
install.packages("pracma")
library(pracma)
}
## Loading required package: pracma
gs = gramSchmidt(A=eig$vectors)
as.data.frame(gs$Q)
## V1 V2 V3 V4 V5
## 1 9.999747e-01 7.106755e-03 7.604406e-06 6.802394e-07 -6.233354e-08
## 2 7.106744e-03 -9.999698e-01 -3.130424e-03 -1.124480e-04 -8.987999e-06
## 3 3.607426e-07 9.843912e-06 3.276378e-02 -9.994192e-01 -2.709532e-03
## 4 1.896068e-09 -5.122782e-09 1.106982e-03 -8.730955e-03 2.520540e-01
## 5 -3.430546e-11 -1.780109e-07 9.222278e-05 -2.843458e-04 -1.681466e-02
## 6 7.280676e-08 -9.477645e-06 4.996314e-04 -8.698326e-04 1.633475e-01
## 7 1.463326e-05 -3.131528e-03 9.992240e-01 3.276006e-02 -2.040249e-02
## 8 1.467063e-07 -2.041324e-05 4.549534e-03 -1.159722e-03 3.273766e-01
## 9 3.804481e-07 -6.819751e-05 2.072662e-02 1.676540e-03 8.374077e-01
## 10 4.530182e-08 -7.420190e-07 7.442480e-04 6.143764e-04 6.261708e-02
## 11 4.643999e-09 -7.224704e-08 1.437989e-04 -7.031651e-05 3.603481e-02
## 12 1.356525e-07 -2.342991e-05 3.906615e-03 -2.534051e-03 2.925687e-01
## 13 3.822531e-09 -4.510087e-07 1.180019e-04 1.344362e-05 8.311393e-03
## 14 3.762095e-08 -5.662700e-06 6.642757e-04 -2.581299e-04 9.274100e-02
## 15 1.569425e-08 -2.347492e-06 3.776934e-04 1.014224e-04 3.406494e-02
## 16 2.832504e-10 -3.612348e-08 1.953432e-06 -2.636424e-06 7.650543e-04
## 17 6.267870e-10 -7.588511e-08 1.376745e-05 9.933738e-06 8.129917e-04
## 18 9.463547e-10 -1.304394e-07 3.220141e-05 -2.724061e-05 2.332382e-03
## V6 V7 V8 V9 V10
## 1 -5.822566e-09 1.139797e-08 9.773677e-09 -4.594780e-08 -1.361968e-08
## 2 1.192345e-05 5.353165e-06 -1.466498e-06 3.854116e-06 3.925471e-07
## 3 3.187000e-03 -8.235817e-03 3.583135e-04 1.396374e-03 -4.346139e-04
## 4 -2.976292e-02 9.653702e-01 2.499145e-02 -1.241493e-02 5.035454e-02
## 5 3.320519e-03 -1.261809e-02 4.995595e-02 -3.564852e-02 8.276962e-02
## 6 -4.839443e-01 -6.873429e-02 7.291064e-01 2.192444e-01 -1.382742e-01
## 7 -5.723366e-03 4.322522e-03 2.208840e-04 7.060479e-04 -2.757826e-04
## 8 -4.116161e-01 -7.334592e-02 -6.326789e-01 3.329605e-01 -1.862596e-01
## 9 4.881175e-01 -2.067870e-01 1.268556e-01 -3.188241e-02 -1.074262e-02
## 10 -5.741838e-02 -3.318512e-02 -2.398614e-02 6.755562e-01 5.762565e-01
## 11 -2.703785e-02 3.708767e-02 -7.214306e-02 1.711070e-01 -6.966457e-01
## 12 -5.450605e-01 -1.076902e-01 -1.225318e-01 -5.863444e-01 2.901616e-01
## 13 -1.076271e-02 -2.818218e-03 -7.367475e-03 1.787050e-02 1.206803e-02
## 14 -2.225382e-01 -2.371476e-02 1.467802e-01 5.007817e-02 -1.788125e-01
## 15 -8.000099e-02 -1.746494e-02 8.065051e-02 7.963148e-02 5.460128e-02
## 16 -1.809252e-03 -1.677573e-04 1.846775e-03 1.567684e-03 -8.732642e-04
## 17 -1.981793e-03 -4.678067e-04 -1.909807e-03 1.264201e-04 1.300531e-03
## 18 -2.498008e-03 -4.011452e-04 -2.090376e-03 9.354233e-04 1.269822e-03
## V11 V12 V13 V14 V15
## 1 4.417525e-09 -1.227823e-23 3.769662e-09 4.722026e-09 5.384599e-25
## 2 -4.174736e-07 -2.978555e-08 -3.049280e-07 -4.778417e-08 3.542790e-08
## 3 -1.552183e-04 -2.677304e-04 3.630014e-04 2.244688e-04 2.226249e-05
## 4 9.831278e-03 1.072821e-02 -1.290419e-03 6.143173e-03 5.272939e-04
## 5 3.515466e-02 7.690653e-01 -5.934234e-01 2.095751e-01 1.216091e-02
## 6 3.165543e-01 2.561157e-02 9.350098e-03 -1.860822e-01 4.207467e-03
## 7 4.240393e-06 -2.014810e-04 1.997676e-04 -2.000269e-04 -7.823850e-06
## 8 3.474753e-01 1.645285e-01 1.448907e-01 5.771106e-02 -2.806471e-03
## 9 -3.490479e-03 5.479346e-03 2.982210e-03 4.495995e-03 -1.786292e-04
## 10 -3.668537e-01 -1.047262e-01 -1.771975e-01 -1.576114e-01 -2.537776e-02
## 11 -3.838293e-01 -2.246403e-01 -4.947221e-01 -1.842324e-01 -1.610512e-02
## 12 -2.534338e-01 -1.679909e-01 -2.255129e-01 -1.383069e-01 -1.424140e-02
## 13 -7.130573e-03 -3.612973e-02 -2.600230e-02 1.738677e-02 8.451081e-01
## 14 -6.360486e-01 2.920380e-01 4.697031e-01 4.178997e-01 1.252985e-02
## 15 1.640327e-01 -4.518130e-01 -2.799829e-01 8.155786e-01 -3.992294e-02
## 16 -1.912146e-03 8.862908e-03 1.446416e-03 -6.165337e-03 -3.676682e-01
## 17 -6.901997e-04 -1.450381e-02 -7.164754e-03 -3.537006e-04 3.830668e-01
## 18 -7.408451e-03 6.915870e-04 -3.347303e-03 1.506317e-03 -2.900742e-02
## V16 V17 V18
## 1 1.293053e-10 5.124691e-27 -3.130223e-25
## 2 2.381118e-08 1.041639e-08 3.080153e-10
## 3 3.254493e-06 -2.093791e-05 -6.453329e-07
## 4 -5.369443e-05 -2.544100e-04 -5.235376e-06
## 5 -2.271815e-03 -2.516954e-03 -6.349697e-07
## 6 -9.051947e-04 3.317874e-03 1.189014e-04
## 7 4.259468e-06 5.630168e-06 1.001056e-07
## 8 -4.078064e-03 -2.268992e-04 1.914937e-05
## 9 -6.468753e-04 -6.118164e-04 -2.073485e-05
## 10 -1.366930e-02 -1.187529e-03 -5.930359e-03
## 11 -4.829122e-03 -2.399308e-04 -5.908173e-03
## 12 -3.463189e-03 -1.345553e-03 -5.914693e-03
## 13 5.315110e-01 -1.547903e-02 1.417060e-03
## 14 -1.108370e-03 -4.838249e-03 3.932646e-06
## 15 -1.049969e-02 -1.565647e-03 -1.868645e-04
## 16 5.697981e-01 -4.847823e-01 5.522712e-01
## 17 -6.228680e-01 -3.712942e-01 5.719821e-01
## 18 6.711889e-02 7.917324e-01 6.064041e-01
# check for othonormality
# sum all columns together and ensure they're all 1
colSums(gs$Q[,1:dim(gs$Q)[1]]^2)
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# so all the columns of Q are normalized
# now check if the columns are orthogonal
round(t(gs$Q[,1]) %*% gs$Q[,2], 15)
round(t(gs$Q[,7]) %*% gs$Q[,3], 15)
round(t(gs$Q[,18]) %*% gs$Q[,10], 15)
## [,1]
## [1,] 0
## [,1]
## [1,] 0
## [,1]
## [1,] 0

Create a change of basis matrix

Now Q is an orthonormal basis of our data set.

So we need to find the change of basis matrix to convert our data into the new basis.

Our data set is currently in the standard basis, so we can use the identity matrix when creating our change of basis matrix via RREF.

# form an augmented matrix
aug = cbind(gs$Q, diag(dim(gs$Q)[1]))
# convert to RREF
aug_r = rref(aug)
# extract the second half of the matrix to get our change of basis matrix
# take the columns starting from numcolumns(gs$Q)+1 to the end
P = aug_r[, (dim(gs$Q)[1] + 1): dim(aug_r)[2]]

Alternatively, instead of using the RREF method above, we could just…

P2 = solve(gs$Q)
# there's some floating point imprecision, but measuring to the 13th digit is close enough for them to be equal.
as.data.frame(round(P,13) == round(P2,13))
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15
## 1 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 3 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 4 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 5 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 6 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 7 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 8 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 9 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 10 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 11 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 12 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 13 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 14 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 15 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 16 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 17 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 18 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## V16 V17 V18
## 1 TRUE TRUE TRUE
## 2 TRUE TRUE TRUE
## 3 TRUE TRUE TRUE
## 4 TRUE TRUE TRUE
## 5 TRUE TRUE TRUE
## 6 TRUE TRUE TRUE
## 7 TRUE TRUE TRUE
## 8 TRUE TRUE TRUE
## 9 TRUE TRUE TRUE
## 10 TRUE TRUE TRUE
## 11 TRUE TRUE TRUE
## 12 TRUE TRUE TRUE
## 13 TRUE TRUE TRUE
## 14 TRUE TRUE TRUE
## 15 TRUE TRUE TRUE
## 16 TRUE TRUE TRUE
## 17 TRUE TRUE TRUE
## 18 TRUE TRUE TRUE

Perform a change of basis

Now we have ‘P’, our change of basis matrix that will change the basis for the original data column vectors from the standard basis to our new, more useful basis.

So lets do that.

In general, for a change of basis, given basis ‘B’ and ‘C’, and change of basis matrix ‘P’ from basis ‘B’ to ‘C’

[X]C = P[X]B

Where [X]C is X in the C basis, and [X]B is X in the B basis.

So we need to multiply each column of our normalized data by ‘P’, or alternatively, we can just multiply the whole data matrix by ‘P’

new_data = as.data.frame(c_data %*% t(P))
dim(new_data)
## [1] 179 18
head(new_data)
## V1 V2 V3 V4 V5 V6
## 1 31060814.33 -4.267682e+05 -1089.046 -77.13338 49.982585 -148.89758
## 2 3581768.92 -3.307288e+03 4440.034 21.90764 24.105844 -82.10200
## 3 32946185.91 -2.147661e+06 -1207.301 -62.87719 -22.305241 -68.78763
## 7 13477.51 -3.318498e+01 8607.884 149.57199 289.585876 66.59977
## 8 69109.56 1.366026e+01 11007.893 204.76164 302.437483 111.35627
## 9 39940488.58 -2.483127e+06 2839.205 68.57118 -8.898148 -23.64994
## V7 V8 V9 V10 V11 V12 V13
## 1 -19.6479509 94.62839 9.255740 -2.7022998 11.7293902 16.017477 -12.7167070
## 2 -11.6644626 -36.52410 7.260611 6.7790857 -0.6294730 -3.315504 -0.3480189
## 3 4.5920168 -22.68162 -13.588736 7.5225056 -2.8718287 -2.574732 -4.2361991
## 7 -21.0599187 7.80183 -30.756739 2.7276239 5.5433329 10.690335 -6.8099246
## 8 -51.1470248 23.34857 -6.793551 -0.0286152 -2.5941324 -1.926062 4.0710499
## 9 -0.3345979 -31.53485 4.121647 4.4767660 0.6348825 1.015110 -0.3684653
## V14 V15 V16 V17 V18
## 1 -1.4296007 -0.11799942 -0.4378892 0.10490776 0.0039501257
## 2 -4.3919372 0.56329369 0.6228932 0.06541574 0.0014461630
## 3 -5.5792145 -0.44752896 -0.4881746 -0.17673665 -0.0064856007
## 7 1.0510467 0.09979664 -0.0866475 0.05903751 0.0037029916
## 8 -2.1545295 -0.30364551 -0.2924974 0.01351778 0.0009931669
## 9 0.8290899 0.73797209 0.4570740 -0.03369564 0.0012427317
if (!require("devtools")) {
install.packages("devtools")
library(devtools)
}
## Loading required package: devtools
## Loading required package: usethis
if (!require("vqv/ggbiplot")) {
install_github("vqv/ggbiplot")
library(ggbiplot)
}
## Loading required package: vqv/ggbiplot
# add the countries and regions back to the data
new_data = cbind(data$region, new_data)
new_data = cbind(data$country, new_data)
colnames(new_data) = c_names
head(new_data)
## country region pop
## 1 Afghanistan ASIA (EX. NEAR EAST) 31060814.33
## 2 Albania EASTERN EUROPE 3581768.92
## 3 Algeria NORTHERN AFRICA 32946185.91
## 7 Anguilla LATIN AMER. & CARIB 13477.51
## 8 Antigua & Barbuda LATIN AMER. & CARIB 69109.56
## 9 Argentina LATIN AMER. & CARIB 39940488.58
## area pop_density coastline_ratio migration infant_mortality
## 1 -4.267682e+05 -1089.046 -77.13338 49.982585 -148.89758
## 2 -3.307288e+03 4440.034 21.90764 24.105844 -82.10200
## 3 -2.147661e+06 -1207.301 -62.87719 -22.305241 -68.78763
## 7 -3.318498e+01 8607.884 149.57199 289.585876 66.59977
## 8 1.366026e+01 11007.893 204.76164 302.437483 111.35627
## 9 -2.483127e+06 2839.205 68.57118 -8.898148 -23.64994
## gdp literacy phones arable crops other climate
## 1 -19.6479509 94.62839 9.255740 -2.7022998 11.7293902 16.017477 -12.7167070
## 2 -11.6644626 -36.52410 7.260611 6.7790857 -0.6294730 -3.315504 -0.3480189
## 3 4.5920168 -22.68162 -13.588736 7.5225056 -2.8718287 -2.574732 -4.2361991
## 7 -21.0599187 7.80183 -30.756739 2.7276239 5.5433329 10.690335 -6.8099246
## 8 -51.1470248 23.34857 -6.793551 -0.0286152 -2.5941324 -1.926062 4.0710499
## 9 -0.3345979 -31.53485 4.121647 4.4767660 0.6348825 1.015110 -0.3684653
## birthrate deathrate agri industry service
## 1 -1.4296007 -0.11799942 -0.4378892 0.10490776 0.0039501257
## 2 -4.3919372 0.56329369 0.6228932 0.06541574 0.0014461630
## 3 -5.5792145 -0.44752896 -0.4881746 -0.17673665 -0.0064856007
## 7 1.0510467 0.09979664 -0.0866475 0.05903751 0.0037029916
## 8 -2.1545295 -0.30364551 -0.2924974 0.01351778 0.0009931669
## 9 0.8290899 0.73797209 0.4570740 -0.03369564 0.0012427317

Now here’s some plots of the data in the new basis, to be blunt, I really have no idea what conclusions to draw from these plots other than the fact that there is very little correlation between the variables

plot(new_data$pop, new_data$gdp,
xlim=c(min(new_data$pop)- 100, max(new_data$pop)+100),
ylim=c(min(new_data$gdp)- 100, max(new_data$gdp)+100))

plot 1

To contrast, here’s the same data in the standard basis.

c_data = as.data.frame(c_data)
plot(c_data$pop, c_data$gdp,
xlim=c(min(c_data$pop)- 100, max(c_data$pop)+100),
ylim=c(min(c_data$gdp)- 100, max(c_data$gdp) + 100))

plot 2

The data in the new basis is clearly more ‘flat’ than the original, and the original data has a weak negative correlation between ‘gdp’ and ‘pop’, whereas the new data has practically none.

Once more with other data…

plot(new_data$pop, new_data$area,
xlim=c(min(new_data$pop)- 100, max(new_data$pop)+100),
ylim=c(min(new_data$area)- 100, max(new_data$area)+100))

plot 3

plot(c_data$pop, c_data$area,
xlim=c(min(c_data$pop)- 100, max(c_data$pop)+100),
ylim=c(min(c_data$area)- 100, max(c_data$area)+100))

plot 4

Here it almost looks like both plots are a reflection over the X-Axis, or rather, a negative vertical stretch of each other. There are a few minute differences in the graphs, like how the new data has more points actually crossing the X-Axis, but both graphs look close enough to each other to be interesting.

Part 2: Using ‘prcomp’ Function#

pca = prcomp(data[, to_center], scale = TRUE, center = TRUE)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.3573 1.5989 1.3783 1.27473 1.20171 0.97416 0.92213
## Proportion of Variance 0.3087 0.1420 0.1055 0.09027 0.08023 0.05272 0.04724
## Cumulative Proportion 0.3087 0.4507 0.5563 0.64655 0.72678 0.77950 0.82675
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.86063 0.74751 0.6841 0.61652 0.60213 0.53394 0.39998
## Proportion of Variance 0.04115 0.03104 0.0260 0.02112 0.02014 0.01584 0.00889
## Cumulative Proportion 0.86789 0.89894 0.9249 0.94605 0.96619 0.98203 0.99092
## PC15 PC16 PC17 PC18
## Standard deviation 0.31224 0.25553 0.02618 0.0003848
## Proportion of Variance 0.00542 0.00363 0.00004 0.0000000
## Cumulative Proportion 0.99633 0.99996 1.00000 1.0000000
ggbiplot(pca, groups = data$region, ellipse = TRUE)

plot 5

This is pretty messy, and not really useful, so lets try trimming some of the data out, then trying again

pca_data1 = as.data.frame(matrix(c(data$pop, data$area, data$pop_density, data$migration, data$gdp), nc=5, nr=length(data$pop)))
colnames(pca_data1) = c("pop", "area", "pop_density", "migration", "gdp")
pca2 = prcomp(pca_data1, scale = TRUE, center = TRUE)
summary(pca2)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 1.2734 1.2256 0.9367 0.7901 0.61219
## Proportion of Variance 0.3243 0.3004 0.1755 0.1248 0.07496
## Cumulative Proportion 0.3243 0.6247 0.8002 0.9250 1.00000
ggbiplot(pca2, groups = data$region)

plot 6

This is more readable, especially without the ellipses.

This graph gives us the projection of our variables onto the plane created by the ‘PC1’ and ‘PC2’ vectors.

From this graph, we can easily see there’s a relation between the vectors:

  • pop and area
  • gdp and migration
  • and to a lesser extent, pop_density and gdp/migration
pca_data2 = as.data.frame(matrix(c(data$pop, data$gdp), nc=2, nr=length(data$pop)))
colnames(pca_data2) = c("pop", "gdp")
pca3 = prcomp(pca_data2, scale = TRUE, center = TRUE)
summary(pca3)
## Importance of components:
## PC1 PC2
## Standard deviation 1.0167 0.9830
## Proportion of Variance 0.5168 0.4832
## Cumulative Proportion 0.5168 1.0000
ggbiplot(pca3, groups = data$region, ellipse = TRUE)

plot 7 With a further subset of our data, we can see a similar result, the lack of correlation between pop and gdp, though with a rotation of ~45 degrees.

But with a further reduced data set, it’s easier to interpret the data points graphically, or rather, it’s much easier to interpret the data points as a linear combination of the vectors on the graph.

Some conclusions I’ve come to are:

  • Most Western Europe countries have a low population, but high GDP
  • There’s one outlier North American country that has the highest GDP, and a higher than average population
  • There’s two outlier Asian countries that have extremely high population.
  • Most countries are scattered close to the zero population vector, meaning there’s less variance in country population.
  • There’s much higher variance in GDP
← Back to Projects