There is some discussion in local news regarding decrease of precipitation and increase of temperature over last decades, however I haven´t seen single proof of these claims. The goal of this post is to get relevant data and check the trend.

library(tidyverse)
library(rvest)
library(lubridate)
library(RSelenium)
library(scales)

No open data? - scrap it

I thought our institutions are more open in providing wheater data (temperature and precipitation) so I was disappointed finding out the only option to get monthly data is to scrap it.

# initiate driver / client
driver<- rsDriver()

## [1] "Connecting to remote server"
## $acceptInsecureCerts
## [1] FALSE
## 
## $acceptSslCerts
## [1] FALSE
## 
## $applicationCacheEnabled
## [1] FALSE
## 
## $browserConnectionEnabled
## [1] FALSE
## 
## $browserName
## [1] "chrome"
## 
## $chrome
## $chrome$chromedriverVersion
## [1] "72.0.3626.7 (efcef9a3ecda02b2132af215116a03852d08b9cb)"
## 
## $chrome$userDataDir
## [1] "C:\\Users\\Petr\\AppData\\Local\\Temp\\scoped_dir4788_11482"
## 
## 
## $cssSelectorsEnabled
## [1] TRUE
## 
## $databaseEnabled
## [1] FALSE
## 
## $`goog:chromeOptions`
## $`goog:chromeOptions`$debuggerAddress
## [1] "localhost:59629"
## 
## 
## $handlesAlerts
## [1] TRUE
## 
## $hasTouchScreen
## [1] FALSE
## 
## $javascriptEnabled
## [1] TRUE
## 
## $locationContextEnabled
## [1] TRUE
## 
## $mobileEmulationEnabled
## [1] FALSE
## 
## $nativeEvents
## [1] TRUE
## 
## $networkConnectionEnabled
## [1] FALSE
## 
## $pageLoadStrategy
## [1] "normal"
## 
## $platform
## [1] "Windows NT"
## 
## $proxy
## named list()
## 
## $rotatable
## [1] FALSE
## 
## $setWindowRect
## [1] TRUE
## 
## $takesHeapSnapshot
## [1] TRUE
## 
## $takesScreenshot
## [1] TRUE
## 
## $timeouts
## $timeouts$implicit
## [1] 0
## 
## $timeouts$pageLoad
## [1] 300000
## 
## $timeouts$script
## [1] 30000
## 
## 
## $unexpectedAlertBehaviour
## [1] "ignore"
## 
## $version
## [1] "70.0.3538.110"
## 
## $webStorageEnabled
## [1] TRUE
## 
## $webdriver.remote.sessionid
## [1] "904a01b30cbb29ae5ecd79979171063c"
## 
## $id
## [1] "904a01b30cbb29ae5ecd79979171063c"

remDr <- driver[["client"]]

# navigate to root page
remDr$navigate("http://portal.chmi.cz/historicka-data/pocasi/uzemni-srazky")

# initiate list 
html_list <- list()

# loop over links -> navigate -> save html into list
for (i in seq_along(remDr$findElements("css", ".Html-link"))){
  remDr$navigate("http://portal.chmi.cz/historicka-data/pocasi/uzemni-srazky")
  Sys.sleep(2)
  elem <- remDr$findElements("css", ".Html-link")
  elem[[i]]$clickElement()
  Sys.sleep(2)
  
  html_list[[i]] <- read_html(remDr$getPageSource()[[1]])
}

# raw html - first two items from list
html_list[1:2]

## [[1]]
## {xml_document}
## <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="cs" lang="cs">
## [1] <head>\n<meta http-equiv="x-ua-compatible" content="IE=Edge">\n<titl ...
## [2] <body>\n    \n    <!-- horni prouzek MZP -->\n    <div id="prouzek_m ...
## 
## [[2]]
## {xml_document}
## <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="cs" lang="cs">
## [1] <head>\n<meta http-equiv="x-ua-compatible" content="IE=Edge">\n<titl ...
## [2] <body>\n    \n    <!-- horni prouzek MZP -->\n    <div id="prouzek_m ...

The list has no names. Each item represent one html for each year. In order to name the list items accordingly we have to extract the years from link elements first.

remDr$navigate("http://portal.chmi.cz/historicka-data/pocasi/uzemni-srazky")
years_names <- read_html(remDr$getPageSource()[[1]]) %>% 
  html_nodes(".Html-link") %>% 
  html_text()
unique(years_names)

##  [1] "1961"                   "1962"                  
##  [3] "1963"                   "1964"                  
##  [5] "1965"                   "1966"                  
##  [7] "1967"                   "1968"                  
##  [9] "1969"                   "1970"                  
## [11] "1971"                   "1972"                  
## [13] "1973"                   "1974"                  
## [15] "1975"                   "1976"                  
## [17] "1977"                   "1978"                  
## [19] "1979"                   "1980"                  
## [21] "1981"                   "1982"                  
## [23] "1983"                   "1984"                  
## [25] "1985"                   "1986"                  
## [27] "1987"                   "1988"                  
## [29] "1989"                   "1990"                  
## [31] "1991"                   "1992"                  
## [33] "1993"                   "1994"                  
## [35] "1995"                   "1996"                  
## [37] "1997"                   "1998"                  
## [39] "1999"                   "2000"                  
## [41] "2001"                   "2002"                  
## [43] "2003"                   "2004"                  
## [45] "2005"                   "2006"                  
## [47] "2007"                   "2008"                  
## [49] "2009"                   "2010"                  
## [51] "2011"                   "2012"                  
## [53] "2013"                   "2014"                  
## [55] "2015"                   "2016"                  
## [57] "2017"                   "2018 - operativní data"

We have a character vector with relevant years. Before naming the list with these years the “2018” needs to be fixed (replaced).

(years_names <- str_replace(years_names, "^.*operativní.*$", "2018"))

##   [1] "1961" "1962" "1963" "1964" "1965" "1966" "1967" "1968" "1969" "1970"
##  [11] "1971" "1972" "1973" "1974" "1975" "1976" "1977" "1978" "1979" "1980"
##  [21] "1981" "1982" "1983" "1984" "1985" "1986" "1987" "1988" "1989" "1990"
##  [31] "1991" "1992" "1993" "1994" "1995" "1996" "1997" "1998" "1999" "2000"
##  [41] "2001" "2002" "2003" "2004" "2005" "2006" "2007" "2008" "2009" "2010"
##  [51] "2011" "2012" "2013" "2014" "2015" "2016" "2017" "2018" "1961" "1962"
##  [61] "1963" "1964" "1965" "1966" "1967" "1968" "1969" "1970" "1971" "1972"
##  [71] "1973" "1974" "1975" "1976" "1977" "1978" "1979" "1980" "1981" "1982"
##  [81] "1983" "1984" "1985" "1986" "1987" "1988" "1989" "1990" "1991" "1992"
##  [91] "1993" "1994" "1995" "1996" "1997" "1998" "1999" "2000" "2001" "2002"
## [101] "2003" "2004" "2005" "2006" "2007" "2008" "2009" "2010" "2011" "2012"
## [111] "2013" "2014" "2015" "2016" "2017" "2018"

We´re done with scraping so the server and client can be stopped.

# close chrome
remDr$close()
#stop server
driver$server$stop()

## [1] TRUE

Extracting the table with data from each html.

tmp <- html_list %>% 
  set_names(years_names) %>% 
  #extract table from each html
  map(html_table, fill = TRUE) %>% 
  #extract second element (data.frame)
  map(~.[[2]])

tmp[1] %>% str

## List of 1
##  $ 1961:'data.frame':    43 obs. of  15 variables:
##   ..$ Kraj : chr [1:43] "Kraj" "Česká republika" "Česká republika" "Česká republika" ...
##   ..$      : chr [1:43] "" "S" "N" "%" ...
##   ..$ Měsíc: num [1:43] 1 22 42 52 15 32 47 14 34 41 ...
##   ..$ Měsíc: chr [1:43] "Měsíc" "45" "38" "118" ...
##   ..$ Měsíc: num [1:43] 2 43 40 108 36 36 100 36 39 92 ...
##   ..$ Měsíc: num [1:43] 3 50 47 106 60 43 140 66 49 135 ...
##   ..$ Měsíc: num [1:43] 4 85 74 115 84 70 120 85 75 113 ...
##   ..$ Měsíc: num [1:43] 5 94 84 112 70 75 93 114 94 121 ...
##   ..$ Měsíc: num [1:43] 6 73 79 92 56 72 78 49 83 59 ...
##   ..$ Měsíc: num [1:43] 7 64 78 82 70 73 96 83 82 101 ...
##   ..$ Měsíc: num [1:43] 8 31 52 60 32 46 70 41 51 80 ...
##   ..$ Měsíc: num [1:43] 9 50 42 119 44 36 122 50 37 135 ...
##   ..$ Měsíc: num [1:43] 10 48 49 98 36 40 90 41 43 95 ...
##   ..$ Měsíc: num [1:43] 11 46 48 96 30 35 86 46 39 118 ...
##   ..$ Rok  : num [1:43] 12 652 674 97 569 590 96 664 659 101 ...

Tidy the data

# create vector of column names
cols <- c("region", "parameter", "Jan", "Feb", "Mar", "Apr", "May", 
          "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "year_total")

tmp %>% 
  map(set_names, cols) %>% 
  map(as_tibble) %>% 
  map(~slice(., 2:nrow(.))) %>% 
  bind_rows(.id = "year") %>% 
  distinct(year, region, .keep_all = TRUE) %>% 
  
  filter(parameter == "S") %>% 
  select(-year_total, -parameter) %>% 
  
  gather(key = "month", value = "precipitation", -year, -region) %>% 
  
  mutate(precipitation = precipitation %>% as.numeric()) %>% 
  mutate(year_month = paste(month, year, sep = "-") %>% parse_date_time("%b-%Y", tz = "CET")) %>% 
  select(year_month, year, month, region, precipitation) -> df_prep_raw

df_prep_raw

## # A tibble: 9,744 x 5
##    year_month          year  month region              precipitation
##    <dttm>              <chr> <chr> <chr>                       <dbl>
##  1 1961-01-01 00:00:00 1961  Jan   Česká republika                22
##  2 1961-01-01 00:00:00 1961  Jan   Praha a Středočeský            15
##  3 1961-01-01 00:00:00 1961  Jan   Jihočeský                      14
##  4 1961-01-01 00:00:00 1961  Jan   Plzeňský                       22
##  5 1961-01-01 00:00:00 1961  Jan   Karlovarský                    39
##  6 1961-01-01 00:00:00 1961  Jan   Ústecký                        23
##  7 1961-01-01 00:00:00 1961  Jan   Liberecký                      41
##  8 1961-01-01 00:00:00 1961  Jan   Královéhradecký                38
##  9 1961-01-01 00:00:00 1961  Jan   Pardubický                     21
## 10 1961-01-01 00:00:00 1961  Jan   Vysočina                       16
## # ... with 9,734 more rows

saveRDS(df_prep_raw, "../../static/data/chmu_srazky.rds")

Now we´re ready to answer some questions with these data - see part II of this post.

Temperature change in Czech Republic - part I

No open data? - scrap it

Tidy the data