Today’s data was on transit costs. Because I don’t know a lot about transit, I spent the time just exploring the variables.
# Load the raw transit-cost table and prepare it for exploration:
#   1. coerce `real_cost` and `start_year` to numeric (values that cannot be
#      parsed as numbers become NA),
#   2. drop rows that have no `line` value,
#   3. derive a `region` column from the country code via countrycode().
transit_cost <- readr::read_csv("transit_cost.csv") %>%
  mutate(
    real_cost = as.numeric(real_cost),
    start_year = as.numeric(start_year)
  ) %>%
  filter(!is.na(line)) %>%
  mutate(
    region = countrycode(country, origin = "ecb", destination = "region"),
    # "UK" is assigned its region by hand -- presumably because the "ecb"
    # origin scheme does not recognise that code (TODO confirm).
    region = case_when(
      country == "UK" ~ "Europe & Central Asia",
      TRUE ~ region
    )
  )
# Peek at the first six rows of the cleaned table.
head(transit_cost, n = 6L)
## # A tibble: 6 x 21
## e country city line start_year end_year rr length tunnel_per tunnel
## <dbl> <chr> <chr> <chr> <dbl> <chr> <dbl> <dbl> <chr> <dbl>
## 1 7136 CA Vanc… Broa… 2020 2025 0 5.7 87.72% 5
## 2 7137 CA Toro… Vaug… 2009 2017 0 8.6 100.00% 8.6
## 3 7138 CA Toro… Scar… 2020 2030 0 7.8 100.00% 7.8
## 4 7139 CA Toro… Onta… 2020 2030 0 15.5 57.00% 8.8
## 5 7144 CA Toro… Yong… 2020 2030 0 7.4 100.00% 7.4
## 6 7145 NL Amst… Nort… 2003 2018 0 9.7 73.00% 7.1
## # … with 11 more variables: stations <dbl>, source1 <chr>, cost <dbl>,
## # currency <chr>, year <dbl>, ppp_rate <dbl>, real_cost <dbl>,
## # cost_km_millions <dbl>, source2 <chr>, reference <chr>, region <chr>
# Peek at the last six rows of the cleaned table.
tail(transit_cost, n = 6L)
## # A tibble: 6 x 21
## e country city line start_year end_year rr length tunnel_per tunnel
## <dbl> <chr> <chr> <chr> <dbl> <chr> <dbl> <dbl> <chr> <dbl>
## 1 9507 TR Ista… M5 P… 2016 2022 0 17.8 100.00% 17.8
## 2 9508 TR Ista… M12 2017 2022 0 13 100.00% 13
## 3 9509 TR Ista… M11 … 2016 2021 0 37.5 100.00% 37.5
## 4 9510 TR Ista… M11 … 2019 2022 0 32 100.00% 32
## 5 9459 UZ Tash… Serg… 2017 2020 0 7.1 0.00% 7.1
## 6 9460 UZ Tash… Yunu… 2017 2020 0 2.9 100.00% 2.9
## # … with 11 more variables: stations <dbl>, source1 <chr>, cost <dbl>,
## # currency <chr>, year <dbl>, ppp_rate <dbl>, real_cost <dbl>,
## # cost_km_millions <dbl>, source2 <chr>, reference <chr>, region <chr>
# Average real cost per country, dplyr flavour.
# The summary expression is deliberately left unnamed, so the result column
# is labelled `mean(real_cost)`.
transit_cost %>%
  group_by(country) %>%
  summarise(mean(real_cost))
## # A tibble: 56 x 2
## country `mean(real_cost)`
## <chr> <dbl>
## 1 AE 6637.
## 2 AR 4646
## 3 AT 1352
## 4 AU 6238.
## 5 BD 12352.
## 6 BE 1170
## 7 BG 1016.
## 8 BH 4882.
## 9 BR 3665
## 10 CA 3283.
## # … with 46 more rows
# Same per-country average as the dplyr summary, computed with base R.
with(transit_cost, tapply(X = real_cost, INDEX = country, FUN = mean))
## AE AR AT AU BD BE BG BH
## 6636.667 4646.000 1352.000 6237.600 12351.893 1170.000 1016.295 4882.500
## BR CA CH CL CN CZ DE DK
## 3665.000 3282.997 865.232 5015.000 4240.633 1519.040 733.160 3491.400
## EC EG ES FI FR GR HU ID
## 3819.000 5784.643 1357.250 1273.870 3868.451 1218.425 3579.840 2934.303
## IL IN IR IT JP KR KW MX
## 5038.400 6753.120 4640.000 971.415 2076.532 2384.063 30400.000 4783.390
## MY NL NO NZ PA PE PH PK
## 18035.500 4030.000 871.155 2991.663 4330.507 11088.400 8338.950 6039.000
## PL PT QA RO RU SA SE SG
## 1340.817 340.600 90000.000 1860.712 5095.583 13545.280 1072.340 19503.500
## TH TR TW UA UK US UZ VN
## 5834.779 1891.235 4977.106 2738.318 8441.067 4377.769 667.500 4756.626
# Scatter plot of real cost against construction start year, coloured by the
# `rr` flag (treated as a factor) with a hand-picked palette and legend text.
# NOTE(review): the NA entry in `breaks` and the third colour may not affect
# how missing `rr` values are drawn; scale_color_manual() documents `na.value`
# for that purpose -- confirm against the rendered plot.
ggplot(
  transit_cost,
  aes(x = start_year, y = real_cost, color = as.factor(rr))
) +
  geom_point() +
  scale_color_manual(
    name = "Railroad",
    values = c("#999999", "#E69F00", "#569BBD"),
    breaks = c(0, 1, NA),
    labels = c("not railroad", "railroad", "missing")
  )
# Count how many rows carry each value of the `rr` flag
# (table() leaves NA out of the counts by default).
table(transit_cost[["rr"]])
##
## 0 1
## 502 34
# Scatter plot of real cost against line length, one colour per region.
ggplot(
  transit_cost,
  aes(x = length, y = real_cost, color = region)
) +
  geom_point()
# Box plots of real cost, one box (and outline colour) per region.
ggplot(
  transit_cost,
  aes(x = region, y = real_cost, color = region)
) +
  geom_boxplot()
# Finish the session with an encouraging message; the recorded output shows
# this call returning a character string.
# NOTE(review): presumably praise::praise() -- the library() call is not
# visible in this part of the file.
praise()
## [1] "You are remarkable!"