There is some discussion in local news regarding decrease of precipitation and increase of temperature over last decades, however I haven´t seen single proof of these claims. The goal of this post is to get relevant data and check the trend.
library(tidyverse)
library(rvest)
library(lubridate)
library(RSelenium)
library(scales)
No open data? - scrap it
I thought our institutions are more open in providing wheater data (temperature and precipitation) so I was disappointed finding out the only option to get monthly data is to scrap it.
# initiate driver / client
driver<- rsDriver()
## [1] "Connecting to remote server"
## $acceptInsecureCerts
## [1] FALSE
##
## $acceptSslCerts
## [1] FALSE
##
## $applicationCacheEnabled
## [1] FALSE
##
## $browserConnectionEnabled
## [1] FALSE
##
## $browserName
## [1] "chrome"
##
## $chrome
## $chrome$chromedriverVersion
## [1] "72.0.3626.7 (efcef9a3ecda02b2132af215116a03852d08b9cb)"
##
## $chrome$userDataDir
## [1] "C:\\Users\\Petr\\AppData\\Local\\Temp\\scoped_dir4788_11482"
##
##
## $cssSelectorsEnabled
## [1] TRUE
##
## $databaseEnabled
## [1] FALSE
##
## $`goog:chromeOptions`
## $`goog:chromeOptions`$debuggerAddress
## [1] "localhost:59629"
##
##
## $handlesAlerts
## [1] TRUE
##
## $hasTouchScreen
## [1] FALSE
##
## $javascriptEnabled
## [1] TRUE
##
## $locationContextEnabled
## [1] TRUE
##
## $mobileEmulationEnabled
## [1] FALSE
##
## $nativeEvents
## [1] TRUE
##
## $networkConnectionEnabled
## [1] FALSE
##
## $pageLoadStrategy
## [1] "normal"
##
## $platform
## [1] "Windows NT"
##
## $proxy
## named list()
##
## $rotatable
## [1] FALSE
##
## $setWindowRect
## [1] TRUE
##
## $takesHeapSnapshot
## [1] TRUE
##
## $takesScreenshot
## [1] TRUE
##
## $timeouts
## $timeouts$implicit
## [1] 0
##
## $timeouts$pageLoad
## [1] 300000
##
## $timeouts$script
## [1] 30000
##
##
## $unexpectedAlertBehaviour
## [1] "ignore"
##
## $version
## [1] "70.0.3538.110"
##
## $webStorageEnabled
## [1] TRUE
##
## $webdriver.remote.sessionid
## [1] "904a01b30cbb29ae5ecd79979171063c"
##
## $id
## [1] "904a01b30cbb29ae5ecd79979171063c"
remDr <- driver[["client"]]
# navigate to root page
remDr$navigate("http://portal.chmi.cz/historicka-data/pocasi/uzemni-srazky")
# initiate list
html_list <- list()
# loop over links -> navigate -> save html into list
for (i in seq_along(remDr$findElements("css", ".Html-link"))){
remDr$navigate("http://portal.chmi.cz/historicka-data/pocasi/uzemni-srazky")
Sys.sleep(2)
elem <- remDr$findElements("css", ".Html-link")
elem[[i]]$clickElement()
Sys.sleep(2)
html_list[[i]] <- read_html(remDr$getPageSource()[[1]])
}
# raw html - first two items from list
html_list[1:2]
## [[1]]
## {xml_document}
## <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="cs" lang="cs">
## [1] <head>\n<meta http-equiv="x-ua-compatible" content="IE=Edge">\n<titl ...
## [2] <body>\n \n <!-- horni prouzek MZP -->\n <div id="prouzek_m ...
##
## [[2]]
## {xml_document}
## <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="cs" lang="cs">
## [1] <head>\n<meta http-equiv="x-ua-compatible" content="IE=Edge">\n<titl ...
## [2] <body>\n \n <!-- horni prouzek MZP -->\n <div id="prouzek_m ...
The list has no names. Each item represent one html for each year. In order to name the list items accordingly we have to extract the years from link elements first.
remDr$navigate("http://portal.chmi.cz/historicka-data/pocasi/uzemni-srazky")
years_names <- read_html(remDr$getPageSource()[[1]]) %>%
html_nodes(".Html-link") %>%
html_text()
unique(years_names)
## [1] "1961" "1962"
## [3] "1963" "1964"
## [5] "1965" "1966"
## [7] "1967" "1968"
## [9] "1969" "1970"
## [11] "1971" "1972"
## [13] "1973" "1974"
## [15] "1975" "1976"
## [17] "1977" "1978"
## [19] "1979" "1980"
## [21] "1981" "1982"
## [23] "1983" "1984"
## [25] "1985" "1986"
## [27] "1987" "1988"
## [29] "1989" "1990"
## [31] "1991" "1992"
## [33] "1993" "1994"
## [35] "1995" "1996"
## [37] "1997" "1998"
## [39] "1999" "2000"
## [41] "2001" "2002"
## [43] "2003" "2004"
## [45] "2005" "2006"
## [47] "2007" "2008"
## [49] "2009" "2010"
## [51] "2011" "2012"
## [53] "2013" "2014"
## [55] "2015" "2016"
## [57] "2017" "2018 - operativní data"
We have a character vector with relevant years. Before naming the list with these years the “2018” needs to be fixed (replaced).
(years_names <- str_replace(years_names, "^.*operativní.*$", "2018"))
## [1] "1961" "1962" "1963" "1964" "1965" "1966" "1967" "1968" "1969" "1970"
## [11] "1971" "1972" "1973" "1974" "1975" "1976" "1977" "1978" "1979" "1980"
## [21] "1981" "1982" "1983" "1984" "1985" "1986" "1987" "1988" "1989" "1990"
## [31] "1991" "1992" "1993" "1994" "1995" "1996" "1997" "1998" "1999" "2000"
## [41] "2001" "2002" "2003" "2004" "2005" "2006" "2007" "2008" "2009" "2010"
## [51] "2011" "2012" "2013" "2014" "2015" "2016" "2017" "2018" "1961" "1962"
## [61] "1963" "1964" "1965" "1966" "1967" "1968" "1969" "1970" "1971" "1972"
## [71] "1973" "1974" "1975" "1976" "1977" "1978" "1979" "1980" "1981" "1982"
## [81] "1983" "1984" "1985" "1986" "1987" "1988" "1989" "1990" "1991" "1992"
## [91] "1993" "1994" "1995" "1996" "1997" "1998" "1999" "2000" "2001" "2002"
## [101] "2003" "2004" "2005" "2006" "2007" "2008" "2009" "2010" "2011" "2012"
## [111] "2013" "2014" "2015" "2016" "2017" "2018"
We´re done with scraping so the server and client can be stopped.
# close chrome
remDr$close()
#stop server
driver$server$stop()
## [1] TRUE
Extracting the table with data from each html.
tmp <- html_list %>%
set_names(years_names) %>%
#extract table from each html
map(html_table, fill = TRUE) %>%
#extract second element (data.frame)
map(~.[[2]])
tmp[1] %>% str
## List of 1
## $ 1961:'data.frame': 43 obs. of 15 variables:
## ..$ Kraj : chr [1:43] "Kraj" "Česká republika" "Česká republika" "Česká republika" ...
## ..$ : chr [1:43] "" "S" "N" "%" ...
## ..$ Měsíc: num [1:43] 1 22 42 52 15 32 47 14 34 41 ...
## ..$ Měsíc: chr [1:43] "Měsíc" "45" "38" "118" ...
## ..$ Měsíc: num [1:43] 2 43 40 108 36 36 100 36 39 92 ...
## ..$ Měsíc: num [1:43] 3 50 47 106 60 43 140 66 49 135 ...
## ..$ Měsíc: num [1:43] 4 85 74 115 84 70 120 85 75 113 ...
## ..$ Měsíc: num [1:43] 5 94 84 112 70 75 93 114 94 121 ...
## ..$ Měsíc: num [1:43] 6 73 79 92 56 72 78 49 83 59 ...
## ..$ Měsíc: num [1:43] 7 64 78 82 70 73 96 83 82 101 ...
## ..$ Měsíc: num [1:43] 8 31 52 60 32 46 70 41 51 80 ...
## ..$ Měsíc: num [1:43] 9 50 42 119 44 36 122 50 37 135 ...
## ..$ Měsíc: num [1:43] 10 48 49 98 36 40 90 41 43 95 ...
## ..$ Měsíc: num [1:43] 11 46 48 96 30 35 86 46 39 118 ...
## ..$ Rok : num [1:43] 12 652 674 97 569 590 96 664 659 101 ...
Tidy the data
# create vector of column names
cols <- c("region", "parameter", "Jan", "Feb", "Mar", "Apr", "May",
"Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "year_total")
tmp %>%
map(set_names, cols) %>%
map(as_tibble) %>%
map(~slice(., 2:nrow(.))) %>%
bind_rows(.id = "year") %>%
distinct(year, region, .keep_all = TRUE) %>%
filter(parameter == "S") %>%
select(-year_total, -parameter) %>%
gather(key = "month", value = "precipitation", -year, -region) %>%
mutate(precipitation = precipitation %>% as.numeric()) %>%
mutate(year_month = paste(month, year, sep = "-") %>% parse_date_time("%b-%Y", tz = "CET")) %>%
select(year_month, year, month, region, precipitation) -> df_prep_raw
df_prep_raw
## # A tibble: 9,744 x 5
## year_month year month region precipitation
## <dttm> <chr> <chr> <chr> <dbl>
## 1 1961-01-01 00:00:00 1961 Jan Česká republika 22
## 2 1961-01-01 00:00:00 1961 Jan Praha a Středočeský 15
## 3 1961-01-01 00:00:00 1961 Jan Jihočeský 14
## 4 1961-01-01 00:00:00 1961 Jan Plzeňský 22
## 5 1961-01-01 00:00:00 1961 Jan Karlovarský 39
## 6 1961-01-01 00:00:00 1961 Jan Ústecký 23
## 7 1961-01-01 00:00:00 1961 Jan Liberecký 41
## 8 1961-01-01 00:00:00 1961 Jan Královéhradecký 38
## 9 1961-01-01 00:00:00 1961 Jan Pardubický 21
## 10 1961-01-01 00:00:00 1961 Jan Vysočina 16
## # ... with 9,734 more rows
saveRDS(df_prep_raw, "../../static/data/chmu_srazky.rds")
Now we´re ready to answer some questions with these data - see part II of this post.