Inputting Data

Today’s data is on transit costs. Because I don’t know a lot about transit, we spent the time just exploring the variables.

# Read the transit-cost CSV and tidy it up:
#   1. coerce `real_cost` and `start_year` to numeric,
#   2. keep only rows where `line` is not missing,
#   3. add a `region` column looked up from the country code.
transit_cost <- readr::read_csv("transit_cost.csv") %>%
  mutate(
    real_cost = as.numeric(real_cost),
    start_year = as.numeric(start_year)
  ) %>%
  filter(!is.na(line)) %>%
  mutate(
    region = case_when(
      # Rows coded "UK" get their region set explicitly
      # (presumably the lookup does not resolve that code -- confirm).
      country == "UK" ~ "Europe & Central Asia",
      TRUE ~ countrycode(country, origin = "ecb", destination = "region")
    )
  )

# Peek at the first rows of the cleaned table.
transit_cost %>%
  head()
## # A tibble: 6 x 21
##       e country city  line  start_year end_year    rr length tunnel_per tunnel
##   <dbl> <chr>   <chr> <chr>      <dbl> <chr>    <dbl>  <dbl> <chr>       <dbl>
## 1  7136 CA      Vanc… Broa…       2020 2025         0    5.7 87.72%        5  
## 2  7137 CA      Toro… Vaug…       2009 2017         0    8.6 100.00%       8.6
## 3  7138 CA      Toro… Scar…       2020 2030         0    7.8 100.00%       7.8
## 4  7139 CA      Toro… Onta…       2020 2030         0   15.5 57.00%        8.8
## 5  7144 CA      Toro… Yong…       2020 2030         0    7.4 100.00%       7.4
## 6  7145 NL      Amst… Nort…       2003 2018         0    9.7 73.00%        7.1
## # … with 11 more variables: stations <dbl>, source1 <chr>, cost <dbl>,
## #   currency <chr>, year <dbl>, ppp_rate <dbl>, real_cost <dbl>,
## #   cost_km_millions <dbl>, source2 <chr>, reference <chr>, region <chr>
# Peek at the last rows of the cleaned table.
transit_cost %>%
  tail()
## # A tibble: 6 x 21
##       e country city  line  start_year end_year    rr length tunnel_per tunnel
##   <dbl> <chr>   <chr> <chr>      <dbl> <chr>    <dbl>  <dbl> <chr>       <dbl>
## 1  9507 TR      Ista… M5 P…       2016 2022         0   17.8 100.00%      17.8
## 2  9508 TR      Ista… M12         2017 2022         0   13   100.00%      13  
## 3  9509 TR      Ista… M11 …       2016 2021         0   37.5 100.00%      37.5
## 4  9510 TR      Ista… M11 …       2019 2022         0   32   100.00%      32  
## 5  9459 UZ      Tash… Serg…       2017 2020         0    7.1 0.00%         7.1
## 6  9460 UZ      Tash… Yunu…       2017 2020         0    2.9 100.00%       2.9
## # … with 11 more variables: stations <dbl>, source1 <chr>, cost <dbl>,
## #   currency <chr>, year <dbl>, ppp_rate <dbl>, real_cost <dbl>,
## #   cost_km_millions <dbl>, source2 <chr>, reference <chr>, region <chr>
# Mean of `real_cost` within each `country`, one row per country.
# The output column keeps the auto-generated label `mean(real_cost)`.
transit_cost %>%
  group_by(country) %>%
  summarise(`mean(real_cost)` = mean(real_cost))
## # A tibble: 56 x 2
##    country `mean(real_cost)`
##    <chr>               <dbl>
##  1 AE                  6637.
##  2 AR                  4646 
##  3 AT                  1352 
##  4 AU                  6238.
##  5 BD                 12352.
##  6 BE                  1170 
##  7 BG                  1016.
##  8 BH                  4882.
##  9 BR                  3665 
## 10 CA                  3283.
## # … with 46 more rows
# Same per-country mean of `real_cost`, computed with base R's tapply().
tapply(
  X = transit_cost[["real_cost"]],
  INDEX = transit_cost[["country"]],
  FUN = mean
)
##        AE        AR        AT        AU        BD        BE        BG        BH 
##  6636.667  4646.000  1352.000  6237.600 12351.893  1170.000  1016.295  4882.500 
##        BR        CA        CH        CL        CN        CZ        DE        DK 
##  3665.000  3282.997   865.232  5015.000  4240.633  1519.040   733.160  3491.400 
##        EC        EG        ES        FI        FR        GR        HU        ID 
##  3819.000  5784.643  1357.250  1273.870  3868.451  1218.425  3579.840  2934.303 
##        IL        IN        IR        IT        JP        KR        KW        MX 
##  5038.400  6753.120  4640.000   971.415  2076.532  2384.063 30400.000  4783.390 
##        MY        NL        NO        NZ        PA        PE        PH        PK 
## 18035.500  4030.000   871.155  2991.663  4330.507 11088.400  8338.950  6039.000 
##        PL        PT        QA        RO        RU        SA        SE        SG 
##  1340.817   340.600 90000.000  1860.712  5095.583 13545.280  1072.340 19503.500 
##        TH        TR        TW        UA        UK        US        UZ        VN 
##  5834.779  1891.235  4977.106  2738.318  8441.067  4377.769   667.500  4756.626
# Scatter plot of real cost against start year, coloured by the `rr` flag
# (converted to a factor so it is treated as discrete).
# NOTE(review): the third colour/label pair is attached to an NA break;
# confirm that ggplot2 renders missing `rr` values the way this intends.
ggplot(
  data = transit_cost,
  mapping = aes(x = start_year, y = real_cost, color = as.factor(rr))
) +
  geom_point() +
  scale_color_manual(
    name = "Railroad",
    values = c("#999999", "#E69F00", "#569BBD"),
    breaks = c(0, 1, NA),
    labels = c("not railroad", "railroad", "missing")
  )

# Frequency count of each value of `rr` (NAs are not counted by default).
table(transit_cost[["rr"]])
## 
##   0   1 
## 502  34
# Scatter plot of real cost against line length, coloured by region.
ggplot(
  data = transit_cost,
  mapping = aes(x = length, y = real_cost, color = region)
) +
  geom_point()

# One box plot of real cost per region; outline colour also follows region.
ggplot(
  data = transit_cost,
  mapping = aes(x = region, y = real_cost, color = region)
) +
  geom_boxplot()

# Finish with an encouraging message (output differs between runs --
# presumably from the praise package; confirm it is loaded above).
praise()
## [1] "You are remarkable!"