# R & Base de Données

# tidyverse

# Install the tidyverse meta-package only when it is missing.
# `requireNamespace()` tests for one package directly; scanning the whole
# `installed.packages()` matrix is slow and is discouraged for this purpose.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
# Attach the tidyverse packages (attaching once is enough).
library(tidyverse)
# Check whether the installed tidyverse packages are up to date.
tidyverse_update()
# List the functions that tidyverse packages mask from other packages.
tidyverse_conflicts()
# tibble() builds its columns in order, so `z` can be computed from the `x`
# and `y` columns defined just before it.
tibble(
  x = 1:5, 
  y = 1, 
  z = x ^ 2 + y
)
#> # A tibble: 5 x 3
#>       x     y     z
#>   <int> <dbl> <dbl>
#> 1     1     1     2
#> 2     2     1     5
#> 3     3     1    10
#> # ... with 2 more rows
# The same call with base data.frame() fails (see the recorded error below):
# `x` is not visible while `z` is being evaluated.
data.frame(
  x = 1:5, 
  y = 1, 
  z = x ^ 2 + y
)
#> Error in data.frame(x = 1:5, y = 1, z = x^2 + y): object 'x' not found
# data.frame() rewrites the non-syntactic column name `1` into `X1`.
data.frame(`1`= 1:3)
#>   X1
#> 1  1
#> 2  2
#> 3  3
# tibble() keeps non-syntactic names as given; they are referred to with
# backticks, including inside later column definitions.
tibble(
  `;)` = 1:5, 
  `42` = "1", 
  `€` = `;)` ^ 2 + as.numeric(`42`)
)
#> # A tibble: 5 x 3
#>    `;)` `42`    `€`
#>   <int> <chr> <dbl>
#> 1     1 1         2
#> 2     2 1         5
#> 3     3 1        10
#> # ... with 2 more rows
# Auto-printing a data.frame shows every row (all 32 rows appear below).
as.data.frame(mtcars)
#>                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
#> Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
#> Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
#> Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
#> Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
#> Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
#> Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
#> Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
#> Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
#> Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
#> Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
#> Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
#> Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
#> Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
#> Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
#> Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
#> Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
#> Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
#> Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
#> Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
#> Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
#> Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
#> Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
#> AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
#> Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
#> Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
#> Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
#> Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
#> Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
#> Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
#> Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
#> Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
#> Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
# Explicit print() of the data.frame gives the same full listing.
print(as.data.frame(mtcars))
#>                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
#> Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
#> Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
#> Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
#> Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
#> Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
#> Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
#> Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
#> Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
#> Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
#> Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
#> Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
#> Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
#> Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
#> Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
#> Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
#> Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
#> Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
#> Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
#> Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
#> Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
#> Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
#> Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
#> AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
#> Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
#> Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
#> Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
#> Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
#> Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
#> Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
#> Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
#> Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
#> Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
# Auto-printing a tibble shows only the first rows plus the column types;
# the row names of `mtcars` are not shown in the output below.
as_tibble(mtcars)
#> # A tibble: 32 x 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#> * <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # ... with 29 more rows
# Explicit print() of the tibble gives the same truncated display.
print(as_tibble(mtcars))
#> # A tibble: 32 x 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#> * <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # ... with 29 more rows
# Move the row names into a regular `rowname` column before converting, so
# the car names are kept in the tibble.
as_tibble(rownames_to_column(mtcars))
#> # A tibble: 32 x 12
#>   rowname        mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <chr>        <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Mazda RX4     21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2 Mazda RX4 W…  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3 Datsun 710    22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # ... with 29 more rows
# Same conversion, printed explicitly.
print(as_tibble(rownames_to_column(mtcars)))
#> # A tibble: 32 x 12
#>   rowname        mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <chr>        <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Mazda RX4     21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2 Mazda RX4 W…  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3 Datsun 710    22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # ... with 29 more rows
# Control how many rows a tibble prints.
# Session-wide defaults are set through options(); a single call can be
# overridden with the `n` and `width` arguments of print().
# The original snippet used undefined placeholders (n, m, p, DF) and carried an
# unbalanced ")" that prevented the whole script from parsing, so concrete
# example values are used here.
options(tibble.print_max = 20, tibble.print_min = 10, dplyr.print_min = 10)
print(x = as_tibble(mtcars), n = 5, width = Inf)
# Open the data in the spreadsheet-style viewer (meant for interactive use).
View(mtcars)
# Three ways to pull one column out of a data.frame; each returns a plain
# numeric vector (see the recorded output).
mtcars$mpg
#>  [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
#> [16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
#> [31] 15.0 21.4
mtcars[["mpg"]]
#>  [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
#> [16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
#> [31] 15.0 21.4
mtcars[, "mpg"]
#>  [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
#> [16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
#> [31] 15.0 21.4
# On a tibble, `$` and `[[` still return a vector ...
as_tibble(mtcars)$mpg
#>  [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
#> [16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
#> [31] 15.0 21.4
as_tibble(mtcars)[["mpg"]]
#>  [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
#> [16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
#> [31] 15.0 21.4
# ... but `[, "mpg"]` keeps a one-column tibble instead of dropping to a vector.
as_tibble(mtcars)[, "mpg"]
#> # A tibble: 32 x 1
#>     mpg
#>   <dbl>
#> 1  21  
#> 2  21  
#> 3  22.8
#> # ... with 29 more rows
# A tibble carries the classes "tbl_df" and "tbl" on top of "data.frame";
# as.data.frame() strips them again.
tb <- as_tibble(mtcars)
class(tb)
#> [1] "tbl_df"     "tbl"        "data.frame"
class(as.data.frame(tb))
#> [1] "data.frame"
# Column access on a base data.frame: `$`, `[[` and single-column `[, ]`
# each extract one column; selecting two columns keeps a data.frame.
df <- data.frame(abc = 1, xyz = "a")
df$xyz
df[["xyz"]]
df[, "xyz"]
df[, c("abc", "xyz")]
x <- "abc"
# `dta` is only created further down the script, so calling class() on it
# here failed with "object 'dta' not found". Inspect the object that exists
# at this point instead.
class(df)
# Build the same data twice: `df1` as a base data.frame, `df2` as a tibble.
df1 <- data.frame(abc = 1, xyz = "a")
df2 <- as_tibble(df1)
# Column access on the data.frame with literal names.
df1$xyz
df1[["xyz"]]
df1[, "xyz"]
df1[, c("abc", "xyz")]
# Column access with the name held in a variable: use `[` or `[[`
# (`$` only accepts a literal name).
x <- "abc"
df1[, x]
df1[[x]]
# Same variable-based access on the tibble, for comparison.
x <- "abc"
df2[, x]
df2[[x]]
# Tibble with non-syntactic column names `1` and `2`; `2` is a noisy linear
# function of `1`.
dta <- tibble(
  `1` = 1:10,
  `2` = `1` * 2 + rnorm(length(`1`))
)
# Same definition repeated (a new random draw replaces `dta`).
dta <- tibble(
  `1` = 1:10,
  `2` = `1` * 2 + rnorm(length(`1`))
)
# Non-syntactic names need backticks everywhere they are used: `$`, base
# plot() and ggplot2 aesthetics.
dta$`1`
plot(dta$`1`, dta$`2`)
# Points are drawn in white.
ggplot(data = dta, aes(x = `1`, y = `2`)) + geom_point(colour = "white")
# A third column can be derived from the two earlier ones in the same call.
dta <- tibble(
  `1` = 1:10,
  `2` = `1` * 2 + rnorm(length(`1`)),
  `3` = `2`/ `1`
)
# Same structure with syntactic names: given as strings on the left-hand
# side, used as bare names on the right-hand side.
dta <- tibble(
  "un" = 1:10,
  "deux" = un * 2 + rnorm(length(un)),
  "trois" = deux / un
)
# List the vignettes shipped with the tibble package.
vignette(package = "tibble")
library(pryr)
library(tidyverse)
tidyverse_conflicts()
# Step-by-step version of the computation, one named intermediate per step.
x <- rnorm(n = 25)        # 25 standard-normal draws
x1 <- sqrt(x)             # NaN (with a warning) for the negative draws
x2 <- na.exclude(x1)      # drop the NaN values
x3 <- x2 < 1              # flag the roots below 1
x4 <- table(x3)           # count FALSE / TRUE
x5 <- prop.table(x4)      # turn counts into proportions
x6 <- which.max(x5)       # position of the most frequent outcome
x7 <- names(x6)           # its label, "TRUE" or "FALSE"
# Derive a new data frame with a factor copy of `cyl`; `mtcars` itself is not
# modified.
mtcars2 <- mtcars %>% mutate(cyl_fac = factor(cyl))

# Size of each object on its own, then of both objects passed together.
pryr::object_size(mtcars)
pryr::object_size(mtcars2)
pryr::object_size(mtcars, mtcars2)
# Same computation, this time overwriting `x` at every step instead of
# creating a new name for each intermediate result.
x <- rnorm(n = 25)

x <- sqrt(x)          # NaN (with a warning) for the negative draws
x <- na.exclude(x)    # drop the NaN values
x <- x < 1            # flag the roots below 1
x <- table(x)         # count FALSE / TRUE
x <- prop.table(x)    # counts -> proportions
x <- which.max(x)     # most frequent outcome
x <- names(x)         # its label
# Fresh sample for the next three equivalent formulations.
x <- rnorm(25)

# 1. Nested calls, indented: must be read from the innermost call outwards.
names(
  which.max(
    prop.table(
      table(
        na.exclude(
          sqrt(x)
        )<1
      )
    )
  )
)

# 2. The same nested calls on a single line.
names(which.max(prop.table(table(na.exclude(sqrt(x))<1))))
# 3. The same steps as a magrittr pipeline, read top to bottom. The
#    comparison is written as a call to the backquoted `<` function so that
#    it can sit in the pipeline.
x %>% 
  sqrt() %>% 
  na.exclude() %>% 
  `<`(1) %>% 
  table() %>% 
  prop.table() %>% 
  which.max() %>% 
  names()
# List the vignettes shipped with the magrittr package.
vignette(package = "magrittr")
library(readxl)
library(tidyverse)
tidyverse_conflicts()
# Show readr's default locale (number format, date names, timezone, encoding).
locale()
#> <locale>
#> Numbers:  123,456.78
#> Formats:  %AD / %AT
#> Timezone: UTC
#> Encoding: UTF-8
#> <date_names>
#> Days:   Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed), Thursday
#>         (Thu), Friday (Fri), Saturday (Sat)
#> Months: January (Jan), February (Feb), March (Mar), April (Apr), May (May),
#>         June (Jun), July (Jul), August (Aug), September (Sep), October
#>         (Oct), November (Nov), December (Dec)
#> AM/PM:  AM/PM
# Import with base R's read.csv(); per the structure printed below, spaces and
# parentheses in the header become dots and the date columns stay character.
ratings <- read.csv(
  file = "./materials/ratings.csv"
)
# Zero-row slice: shows the column names and types without printing any data.
str(ratings[0, ])
#> 'data.frame':    0 obs. of  13 variables:
#>  $ Const         : chr 
#>  $ Your.Rating   : int 
#>  $ Date.Rated    : chr 
#>  $ Title         : chr 
#>  $ URL           : chr 
#>  $ Title.Type    : chr 
#>  $ IMDb.Rating   : num 
#>  $ Runtime..mins.: int 
#>  $ Year          : int 
#>  $ Genres        : chr 
#>  $ Num.Votes     : int 
#>  $ Release.Date  : chr 
#>  $ Directors     : chr
# Import the same file with readr's read_csv(); per the column specification
# printed below, the original header names are kept and the date columns are
# parsed as dates.
ratings <- read_csv(
  file = "./materials/ratings.csv"
)
#> Parsed with column specification:
#> cols(
#>   Const = col_character(),
#>   `Your Rating` = col_double(),
#>   `Date Rated` = col_date(format = ""),
#>   Title = col_character(),
#>   URL = col_character(),
#>   `Title Type` = col_character(),
#>   `IMDb Rating` = col_double(),
#>   `Runtime (mins)` = col_double(),
#>   Year = col_double(),
#>   Genres = col_character(),
#>   `Num Votes` = col_double(),
#>   `Release Date` = col_date(format = ""),
#>   Directors = col_character()
#> )
# Inline data with base R: pass the CSV content through `text =`.
read.csv(
  text = "a,b,c
    1,2,3
    4,5,6"
)
#>   a b c
#> 1 1 2 3
#> 2 4 5 6
# Inline data with readr: the CSV content is passed as the first argument.
read_csv(
  "a,b,c
  1,2,3
  4,5,6"
)
#> # A tibble: 2 x 3
#>       a     b     c
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3
#> 2     4     5     6
# `skip = 1` ignores the first line (here a line of metadata).
read_csv("Une ligne de métadonnées
  x,y,z
  1,2,3", skip = 1)
#> # A tibble: 1 x 3
#>       x     y     z
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3
# `comment = "#"` drops lines starting with the comment character.
read_csv("# Une ligne de commentaires
  x,y,z
  1,2,3", comment = "#")
#> # A tibble: 1 x 3
#>       x     y     z
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3
# `col_names = FALSE`: no header line; columns are named X1, X2, X3.
read_csv("1,2,3\n4,5,6", col_names = FALSE)
#> # A tibble: 2 x 3
#>      X1    X2    X3
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3
#> 2     4     5     6
# `col_names` can also supply the column names explicitly.
read_csv("1,2,3\n4,5,6", col_names = c("a", "b", "c"))
#> # A tibble: 2 x 3
#>       a     b     c
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3
#> 2     4     5     6
# `na = "."` declares which string stands for a missing value.
read_csv("a,b,c\n1,2,3\n4,.,6", na = ".")
#> # A tibble: 2 x 3
#>       a     b     c
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3
#> 2     4    NA     6
# `col_types` sets the parser of each column explicitly, in column order.
read_csv(
  "a,b,c\n1,x,2018-05-03\n4,y,2018-05-04", 
  col_types = list(col_integer(), col_character(), col_date(format = "%Y-%m-%d"))
)
#> # A tibble: 2 x 3
#>       a b     c         
#>   <int> <chr> <date>    
#> 1     1 x     2018-05-03
#> 2     4 y     2018-05-04
# Exercise material: inputs on which read_csv() needs help or complains.
# A field quoted with single quotes and containing the delimiter.
"x,y\n1,'a,b'"
# More values than column names.
read_csv("a,b\n1,2,3\n4,5,6")
# Rows that are shorter and longer than the header.
read_csv("a,b,c\n1,2\n1,2,3,4")
# An opening double quote that is never closed.
read_csv("a,b\n\"1")
# Numbers and letters mixed in the same columns.
read_csv("a,b\n1,2\na,b")
# Semicolon-separated input read with a comma-separated parser.
read_csv("a;b\n1;3")
# Generic delimiter: read_delim() with an explicit `delim`.
# NOTE(review): `path` is not defined anywhere in the visible script; this
# looks like a template line - confirm before running.
read_delim(file = path, delim = "|")
# Declaring the single quote as the quoting character.
read_csv("x,y\n1,'a,b'", quote = "\'")
read_delim("x,y\n1,'a,b'", delim = ",", quote = "\'")
# The same problematic inputs again.
read_csv("a,b\n1,2,3\n4,5,6")
read_csv("a,b,c\n1,2\n1,2,3,4")
read_csv("a,b\n\"1")
read_csv("a,b\n1,2\na,b")
read_csv("a;b\n1;3")
# Export the ratings table to CSV and to Excel.
# readr deprecated the `path` argument of write_csv() in favour of `file`;
# writexl::write_xlsx() still takes `path`.
readr::write_csv(x = ratings, file = "ratings.csv")
writexl::write_xlsx(x = ratings, path = "ratings.xlsx")
library(tidyverse)
tidyverse_conflicts()
# Reload the ratings with readr so the original column names are kept.
ratings <- read_csv(file = "./materials/ratings.csv")
# Conditions passed to filter() as separate arguments: both must hold.
filter(ratings, `Your Rating` >= 9, `IMDb Rating` >= 9)

#> # A tibble: 1 x 13
#>   Const `Your Rating` `Date Rated` Title URL   `Title Type` `IMDb Rating`
#>   <chr>         <dbl> <date>       <chr> <chr> <chr>                <dbl>
#> 1 tt01…             9 2014-11-08   The … http… movie                  9.3
#> # ... with 6 more variables: `Runtime (mins)` <dbl>, Year <dbl>, Genres <chr>,
#> #   `Num Votes` <dbl>, `Release Date` <date>, Directors <chr>
# Same filter written as a pipeline with an explicit `&`; the recorded output
# is identical.
ratings %>% 
  filter(`Your Rating` >= 9 & `IMDb Rating` >= 9)

#> # A tibble: 1 x 13
#>   Const `Your Rating` `Date Rated` Title URL   `Title Type` `IMDb Rating`
#>   <chr>         <dbl> <date>       <chr> <chr> <chr>                <dbl>
#> 1 tt01…             9 2014-11-08   The … http… movie                  9.3
#> # ... with 6 more variables: `Runtime (mins)` <dbl>, Year <dbl>, Genres <chr>,
#> #   `Num Votes` <dbl>, `Release Date` <date>, Directors <chr>
# Exercise answers, revealed one at a time on the slides (hence the repeats).
# Written here as direct verb calls rather than pipelines.

# Title with the largest number of votes.
filter(ratings, `Num Votes` == max(`Num Votes`))
filter(ratings, `Num Votes` == max(`Num Votes`))
# Titles containing "shark", whatever the case.
filter(ratings, grepl("shark", Title, ignore.case = TRUE))
filter(ratings, `Num Votes` == max(`Num Votes`))
filter(ratings, grepl("shark", Title, ignore.case = TRUE))
# Titles rated between two dates (bounds given as strings).
filter(ratings, `Date Rated` >= "2018-04-25" & `Date Rated` <= "2018-05-03")
filter(ratings, `Num Votes` == max(`Num Votes`))
filter(ratings, grepl("shark", Title, ignore.case = TRUE))
filter(ratings, `Date Rated` >= "2018-04-25" & `Date Rated` <= "2018-05-03")
# Same date window with between() and explicit Date bounds.
filter(
  ratings,
  between(`Date Rated`, as.Date("2018-04-25"), as.Date("2018-05-03"))
)
# Most recently rated titles first.
arrange(ratings, desc(`Date Rated`))

#> # A tibble: 2,244 x 13
#>    Const `Your Rating` `Date Rated` Title URL   `Title Type` `IMDb Rating`
#>    <chr>         <dbl> <date>       <chr> <chr> <chr>                <dbl>
#>  1 tt22…             2 2018-05-01   Sher… http… movie                  4.8
#>  2 tt04…             3 2018-04-30   The … http… movie                  5.8
#>  3 tt51…             6 2018-04-30   Pete… http… movie                  6.6
#>  4 tt64…             4 2018-04-30   Happ… http… movie                  5.7
#>  5 tt68…             4 2018-04-30   The … http… movie                  5.1
#>  6 tt46…             3 2018-04-27   Lovi… http… movie                  6  
#>  7 tt76…             4 2018-04-26   Amou… http… movie                  5  
#>  8 tt41…             7 2018-04-25   Aven… http… movie                  9  
#>  9 tt68…             4 2018-04-23   Je n… http… movie                  6.4
#> 10 tt69…             6 2018-04-23   Plac… http… movie                  6.1
#> # ... with 2,234 more rows, and 6 more variables: `Runtime (mins)` <dbl>,
#> #   Year <dbl>, Genres <chr>, `Num Votes` <dbl>, `Release Date` <date>,
#> #   Directors <chr>
# Sort by IMDb rating, breaking ties with the personal rating (both
# descending).
arrange(ratings, desc(`IMDb Rating`), desc(`Your Rating`))
# Alphabetical order of titles.
arrange(ratings, Title)
# Most recent ratings first, keeping the title and every column whose name
# ends with "Rating".
select(
  arrange(ratings, desc(`Date Rated`)),
  Title, ends_with("Rating")
)

#> # A tibble: 2,244 x 3
#>   Title              `Your Rating` `IMDb Rating`
#>   <chr>                      <dbl>         <dbl>
#> 1 Sherlock Gnomes                2           4.8
#> 2 The Heartbreak Kid             3           5.8
#> 3 Peter Rabbit                   6           6.6
#> 4 Happy Anniversary              4           5.7
#> # ... with 2,240 more rows
# Columns whose name contains "Date".
select(ratings, contains("Date"))
# Columns picked (and reordered) by position.
select(ratings, 4, 2, 3, 12)
# Flag the titles rated at least as high as the IMDb rating, newest first.
ratings %>%
  arrange(desc(`Date Rated`)) %>%
  mutate(Rating_is_better = `Your Rating` >= `IMDb Rating`) %>%
  select(Title, `Your Rating`, `IMDb Rating`, Rating_is_better)
#> # A tibble: 2,244 x 4
#>    Title                          `Your Rating` `IMDb Rating` Rating_is_better
#>    <chr>                                  <dbl>         <dbl> <lgl>           
#>  1 Sherlock Gnomes                            2           4.8 FALSE           
#>  2 The Heartbreak Kid                         3           5.8 FALSE           
#>  3 Peter Rabbit                               6           6.6 FALSE           
#>  4 Happy Anniversary                          4           5.7 FALSE           
#>  5 The Week Of                                4           5.1 FALSE           
#>  6 Loving Pablo                               3           6   FALSE           
#>  7 Amoureux de ma femme                       4           5   FALSE           
#>  8 Avengers: Infinity War                     7           9   FALSE           
#>  9 Je ne suis pas un homme facile             4           6.4 FALSE           
#> 10 Place publique                             6           6.1 FALSE           
#> # ... with 2,234 more rows
# Average of the personal and the IMDb rating.
mutate(ratings, `Avg Rating` = (`Your Rating` + `IMDb Rating`) / 2)
# Month and day of the rating date, extracted with lubridate.
mutate(
  ratings,
  Month = lubridate::month(`Date Rated`),
  Day = lubridate::day(`Date Rated`)
)
# One-row summary computed straight from the file: number of titles, mean
# personal rating, first and last rating dates.
summarise(
  read_csv(file = "./materials/ratings.csv"),
  n_movies = n(),
  my_average_rating = mean(`Your Rating`),
  oldest_rating = min(`Date Rated`),
  newest_rating = max(`Date Rated`)
)

#> # A tibble: 1 x 4
#>   n_movies my_average_rating oldest_rating newest_rating
#>      <int>             <dbl> <date>        <date>       
#> 1     2244              5.60 2014-11-08    2018-05-01
# Summary statistics of the IMDb rating, spelled out one by one.
ratings %>% 
  summarise(
    mean = mean(`IMDb Rating`),
    sd = sd(`IMDb Rating`),
    min = min(`IMDb Rating`),
    max = max(`IMDb Rating`)
  )
# Same statistics through the scoped verb. `funs()` is deprecated in dplyr;
# a named list of functions is the supported replacement, and its names give
# the output column names.
ratings %>% 
  summarise_at(
    .vars = vars(`IMDb Rating`), 
    .funs = list(mean = mean, sd = sd, min = min, max = max)
  )
# Time span between the first and the last rating.
ratings %>% 
  summarise(
    ndays = max(`Date Rated`) - min(`Date Rated`)
  )
# Per-year summary of the ratings, most recent year first.
read_csv(file = "./materials/ratings.csv") %>% 
  mutate(
    # Replace the `Year` column with the year in which the title was rated.
    Year = lubridate::year(`Date Rated`)
  ) %>% 
  # `Year` is numeric: compare it with a number rather than relying on the
  # implicit number-to-string coercion of `Year != "2014"`.
  filter(Year != 2014) %>% 
  group_by(Year) %>% 
  summarise(
    n_movies = n(),
    my_average_rating = mean(`Your Rating`),
    users_average_rating = mean(`IMDb Rating`)
  ) %>% 
  arrange(desc(Year))
#> # A tibble: 4 x 4
#>    Year n_movies my_average_rating users_average_rating
#>   <dbl>    <int>             <dbl>                <dbl>
#> 1  2018       67              4.88                 6.35
#> 2  2017      204              5.12                 6.43
#> 3  2016      238              5.13                 6.48
#> 4  2015      374              5.07                 6.19
# For the titles rated in 2017: number of titles and total runtime per
# personal rating, longest total runtime first.
read_csv(file = "./materials/ratings.csv") %>% 
  mutate(
    # Replace the `Year` column with the year in which the title was rated.
    Year = lubridate::year(`Date Rated`)
  ) %>% 
  # `Year` is numeric: compare it with a number, not with the string "2017".
  filter(Year == 2017) %>% 
  group_by(`Your Rating`) %>% 
  summarise(
    n_movies = n(),
    runtime_all = sum(`Runtime (mins)`)
  ) %>% 
  arrange(desc(runtime_all))
# List the vignettes shipped with the dplyr package.
vignette(package = "dplyr")
library(DBI)
library(odbc)
tidyverse_conflicts()
# SQL equivalent of the setup performed from R further down. These are SQL
# statements, not R code, so they are kept as comments to keep the script
# parseable. The password matches the one used by the later dbConnect()
# call, and the role is written as an identifier (not a quoted string), as
# in the later ALTER DATABASE statement sent from R.
#   CREATE ROLE test_user LOGIN password 'test_pwd';
#   CREATE DATABASE test_data;
#   ALTER DATABASE test_data OWNER TO test_user;
# List the installed ODBC drivers, keeping only the "Description" rows and
# dropping the now-constant `attribute` column (name/value pairs remain).
odbcListDrivers() %>% 
  filter(attribute=="Description") %>% 
  select(-attribute)
#>                 name                                    value
#> 1             SQLite                       SQLite ODBC Driver
#> 2            SQLite3                      SQLite3 ODBC Driver
#> 3    PostgreSQL ANSI    PostgreSQL ODBC driver (ANSI version)
#> 4 PostgreSQL Unicode PostgreSQL ODBC driver (Unicode version)
# Variant 1: connect as the administrator with the password written in the
# script (acceptable for a local demo only).
con <- dbConnect(
  drv = odbc(),
  driver = "PostgreSQL Unicode",
  server = "localhost",
  port = 5432,
  database = "postgres",
  uid = "postgres",
  password = "password"
)
# Close the first connection before `con` is reassigned; otherwise it stays
# open with nothing left to refer to it.
dbDisconnect(con)
# Variant 2: same connection, but the password is requested through an
# RStudio prompt instead of being stored in the script.
con <- dbConnect(
  drv = odbc(),
  driver = "PostgreSQL Unicode",
  server = "localhost",
  port = 5432,
  database = "postgres",
  uid = "postgres",
  password = rstudioapi::askForPassword("Mot de passe :")
)
# Create the demo role and database, then hand the database over to the role.
# These statements return no rows, so they are run with dbExecute(), which
# executes the statement and releases its result. With dbSendQuery() each
# result stayed open, which produced the "Cancelling previous query" and
# "There is a result object still in use" warnings previously recorded here.
dbExecute(con, SQL("CREATE ROLE test_user LOGIN password 'test_pwd';"))
dbExecute(con, SQL("CREATE DATABASE test_data;"))
dbExecute(con, SQL("ALTER DATABASE test_data OWNER TO test_user;"))
# Close the administrator connection.
dbDisconnect(con)
# Reconnect to the new `test_data` database as `test_user`.
con <- dbConnect(
  drv = odbc(),
  driver = "PostgreSQL Unicode",
  server = "localhost",
  database = "test_data",
  port = 5432,
  uid = "test_user",
  password = "test_pwd"
)
# The database is empty at this point.
dbListTables(con)
#> character(0)
# Prepare mtcars (row names to a `car` column, first word of the name to
# `const`, moved to the front) and write it as table "mtcars". The `.`
# placeholder puts the piped data in dbWriteTable()'s third argument.
mtcars %>% 
  rownames_to_column(var = "car") %>% 
  mutate(const = gsub(" .*", "", car)) %>% 
  select(const, everything()) %>% 
  dbWriteTable(con, "mtcars", .)

# The table now exists; list its columns.
dbListTables(con)
#> [1] "mtcars"
dbListFields(con, "mtcars")
#>  [1] "const" "car"   "mpg"   "cyl"   "disp"  "hp"    "drat"  "wt"    "qsec" 
#> [10] "vs"    "am"    "gear"  "carb"
# Read the whole table back and show it as a tibble.
dbReadTable(con, "mtcars") %>% 
  as_tibble()
#> # A tibble: 32 x 13
#>   const  car     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Mazda  Mazd…  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2 Mazda  Mazd…  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3 Datsun Dats…  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> 4 Hornet Horn…  21.4     6   258   110  3.08  3.22  19.4     1     0     3     1
#> 5 Hornet Horn…  18.7     8   360   175  3.15  3.44  17.0     0     0     3     2
#> # ... with 27 more rows
# Low-level query cycle: send the query, fetch its rows, release the result,
# then close the connection.
res <- dbSendQuery(con, "SELECT * FROM mtcars WHERE const='Merc'")
dbFetch(res)
#>   const         car  mpg cyl  disp  hp drat   wt qsec vs am gear carb
#> 1  Merc   Merc 240D 24.4   4 146.7  62 3.69 3.19 20.0  1  0    4    2
#> 2  Merc    Merc 230 22.8   4 140.8  95 3.92 3.15 22.9  1  0    4    2
#> 3  Merc    Merc 280 19.2   6 167.6 123 3.92 3.44 18.3  1  0    4    4
#> 4  Merc   Merc 280C 17.8   6 167.6 123 3.92 3.44 18.9  1  0    4    4
#> 5  Merc  Merc 450SE 16.4   8 275.8 180 3.07 4.07 17.4  0  0    3    3
#> 6  Merc  Merc 450SL 17.3   8 275.8 180 3.07 3.73 17.6  0  0    3    3
#> 7  Merc Merc 450SLC 15.2   8 275.8 180 3.07 3.78 18.0  0  0    3    3
dbClearResult(res)
dbDisconnect(con)
# Connect as the administrator to create the role and database used in the
# rest of the script.
con <- dbConnect(
  drv = odbc(),
  driver = "PostgreSQL Unicode",
  server = "localhost",
  database = "postgres",
  port = 5432,
  uid = "postgres",
  password = "password"
)
# These statements return no rows: dbExecute() runs each one and releases
# its result, whereas dbSendQuery() would leave every result open.
dbExecute(con, SQL("CREATE ROLE data_user LOGIN password 'data_pwd';"))
dbExecute(con, SQL("CREATE DATABASE datawarehouse;"))
dbExecute(con, SQL("ALTER DATABASE datawarehouse OWNER TO data_user;"))
dbDisconnect(con)
# Read the ratings and strip the spaces from the column names so that they
# are easier to use as SQL identifiers.
# The encoding is now stated explicitly: the original call passed an empty
# `encoding = ` argument, which only worked by silently falling back to the
# default.
ratings <- read_csv(
  file = "./materials/ratings.csv",
  locale = locale(encoding = "UTF-8")
) %>% 
  `colnames<-`(gsub(" ", "", colnames(.)))
# Connect to the `datawarehouse` database as `data_user`.
con <- dbConnect(
  drv = odbc(),
  driver = "PostgreSQL Unicode",
  server = "localhost",
  database = "datawarehouse",
  port = 5432,
  uid = "data_user",
  password = "data_pwd"
)
# Upload the ratings as table "ratings" and list its columns.
dbWriteTable(con, "ratings", ratings)
dbListFields(con, 'ratings')
# Number of titles per personal rating, computed by the database.
# dbGetQuery() sends the query, fetches all rows and releases the result in
# one call; the previous dbSendQuery() %>% dbFetch() chain never cleared its
# result object.
dbGetQuery(
  conn = con, 
  statement = SQL(
    'SELECT "YourRating", COUNT(*) as count FROM ratings GROUP BY "YourRating";'
  )
) %>% 
  as_tibble()
# Titles rated in 2017, written back to the database as table "ratings2017".
# Fetching with dbGetQuery() also ensures that no result is still open on
# the connection when dbWriteTable() runs.
dbGetQuery(
  conn = con, 
  statement = SQL(
    'SELECT * 
      FROM ratings 
      WHERE "DateRated">=\'2017-01-01\' AND "DateRated"<=\'2017-12-31\';'
  )
) %>% 
  dbWriteTable(con, "ratings2017", .)

dbListTables(con)
# List the vignettes shipped with the DBI package.
vignette(package = "DBI")
library(DBI)
library(odbc)
library(tidyverse)
library(dbplyr)
tidyverse_conflicts()
# Option 1: an in-memory SQLite database. RSQLite's argument is `dbname`;
# `path` is not an argument of this method, so the original call did not
# open ":memory:" as intended.
con <- dbConnect(RSQLite::SQLite(), dbname = ":memory:")
# Close it before `con` is reused for the PostgreSQL connection.
dbDisconnect(con)
# Option 2 (used in the rest of the script): the PostgreSQL data warehouse.
con <- dbConnect(
  drv = odbc(),
  driver = "PostgreSQL Unicode",
  server = "localhost",
  database = "datawarehouse",
  port = 5432,
  uid = "data_user",
  password = "data_pwd"
)
# Reload the ratings, strip spaces from the column names, and replace the
# `Year` column with the year in which each title was rated.
ratings <- read_csv(file = "./materials/ratings.csv") %>% 
  `colnames<-`(gsub(" ", "", colnames(.))) %>% 
  mutate(Year = lubridate::year(DateRated))
# Upload as a permanent table "ratings_idx" (replaced if it already exists),
# requesting an index on each of the listed columns.
copy_to(
  dest = con, 
  df = ratings, 
  name = "ratings_idx",
  temporary = FALSE, 
  indexes = list(
    "DateRated", 
    "YourRating", 
    "Title", 
    "Year"
  ),
  overwrite = TRUE
)
# Reference to a remote table; no data is downloaded at this point.
# NOTE(review): this points at the table "ratings", not at the "ratings_idx"
# table created just above - confirm which one is intended.
ratings_db <- tbl(con, "ratings")
class(ratings_db)
#> [1] "tbl_dbi"  "tbl_sql"  "tbl_lazy" "tbl"
# Printing the reference fetches only the first rows; the row count is shown
# as `??` in the recorded output.
ratings_db
#> # Source:   table<ratings> [?? x 13]
#> # Database: postgres [data_user@localhost:/datawarehouse]
#>   Const YourRating DateRated  Title URL   TitleType IMDbRating `Runtime(mins)`
#>   <chr>      <dbl> <date>     <chr> <chr> <chr>          <dbl>           <dbl>
#> 1 tt10…          5 2014-12-17 He's… http… movie            6.4             129
#> 2 tt10…          7 2014-11-08 Mega… http… movie            7.3              95
#> 3 tt01…          3 2014-11-26 The … http… movie            5.1              90
#> 4 tt10…          7 2014-12-26 Ca$h  http… movie            6               100
#> # ... with more rows, and 5 more variables: Year <dbl>, Genres <chr>,
#> #   NumVotes <dbl>, ReleaseDate <date>, Directors <chr>
# Date one week before today, computed locally in R.
date_sevendaysago <- Sys.Date() - 7

# Titles rated during the last seven days, restricted to four columns.
select(
  filter(ratings_db, DateRated > date_sevendaysago),
  Title, DateRated, YourRating, IMDbRating
)
#> # Source:   lazy query [?? x 4]
#> # Database: postgres [data_user@localhost:/datawarehouse]
#> # ... with 4 variables: Title <chr>, DateRated <date>, YourRating <dbl>,
#> #   IMDbRating <dbl>
# Per-year count and mean rating on the remote table, written with the usual
# dplyr verbs. The recorded output warns that SQL always removes missing
# values in the mean.
ratings_db %>% 
  group_by(Year) %>% 
  summarise(
    N = n(),
    AvgRating = mean(YourRating)
  ) %>% 
  arrange(Year)
#> Warning: Missing values are always removed in SQL.
#> Use `AVG(x, na.rm = TRUE)` to silence this warning
#> # Source:     lazy query [?? x 3]
#> # Database:   postgres [data_user@localhost:/datawarehouse]
#> # Ordered by: Year
#>    Year N               AvgRating
#>   <dbl> <S3: integer64>     <dbl>
#> 1  1936 1                       8
#> 2  1937 1                       7
#> 3  1940 1                       7
#> 4  1941 1                       7
#> # ... with more rows
# Local version: the pipeline runs at once on the in-memory tibble and
# returns data (5 rows in the recorded structure below).
ratings_summary <- ratings %>% 
  group_by(Year) %>% 
  summarise(
    N = n(),
    AvgRating = mean(YourRating)
  ) %>% 
  arrange(Year)
str(ratings_summary, 1)
#> Classes 'tbl_df', 'tbl' and 'data.frame':    5 obs. of  3 variables:
#>  $ Year     : num  2014 2015 2016 2017 2018
#>  $ N        : int  1361 374 238 204 67
#>  $ AvgRating: num  5.93 5.07 5.13 5.12 4.88
# Remote version: the same pipeline on the database table only stores the
# source and the list of operations (see the structure below), not any rows.
ratings_summary_db <- ratings_db %>% 
  group_by(Year) %>% 
  summarise(
    N = n(),
    AvgRating = mean(YourRating)
  ) %>% 
  arrange(Year)
str(ratings_summary_db, 1)
#> List of 2
#>  $ src:List of 2
#>   ..- attr(*, "class")= chr [1:3] "src_dbi" "src_sql" "src"
#>  $ ops:List of 4
#>   ..- attr(*, "class")= chr [1:3] "op_arrange" "op_single" "op"
#>  - attr(*, "class")= chr [1:4] "tbl_dbi" "tbl_sql" "tbl_lazy" "tbl"
# Printing the lazy query shows the first rows of the result; the source is
# still reported as a lazy query of unknown length (`??`).
ratings_summary_db %>% 
  print()
#> Warning: Missing values are always removed in SQL.
#> Use `AVG(x, na.rm = TRUE)` to silence this warning
#> # Source:     lazy query [?? x 3]
#> # Database:   postgres [data_user@localhost:/datawarehouse]
#> # Ordered by: Year
#>    Year N               AvgRating
#>   <dbl> <S3: integer64>     <dbl>
#> 1  1936 1                       8
#> 2  1937 1                       7
#> 3  1940 1                       7
#> 4  1941 1                       7
#> # ... with more rows
# collect() retrieves the full result as a local tibble (66 rows in the
# recorded output).
ratings_summary_db %>% 
  collect()
#> Warning: Missing values are always removed in SQL.
#> Use `AVG(x, na.rm = TRUE)` to silence this warning
#> # A tibble: 66 x 3
#>    Year N               AvgRating
#>   <dbl> <S3: integer64>     <dbl>
#> 1  1936 1                       8
#> 2  1937 1                       7
#> 3  1940 1                       7
#> 4  1941 1                       7
#> # ... with 62 more rows
 # show_query() prints the SQL generated for the pipeline.
 ratings_summary_db %>% 
  show_query()
#> Warning: Missing values are always removed in SQL.
#> Use `AVG(x, na.rm = TRUE)` to silence this warning
#> <SQL>
#> SELECT "Year", COUNT(*) AS "N", AVG("YourRating") AS "AvgRating"
#> FROM "ratings"
#> GROUP BY "Year"
#> ORDER BY "Year"
# translate_sql() shows how individual R expressions are rendered in SQL.
# Here head() is passed through as a HEAD() function call.
translate_sql(head(ratings_db))
#> <SQL> HEAD("ratings_db")
# Type conversions become CAST expressions.
translate_sql(as.numeric(x))
#> <SQL> CAST("x" AS NUMERIC)
translate_sql(as.character(x))
#> <SQL> CAST("x" AS TEXT)
# The `^` operator becomes POWER().
translate_sql(x^2)
#> <SQL> POWER("x", 2.0)
# The same translation applied to a real one-row query.
tbl(con, sql('select 1 as x')) %>%
  mutate(sqr = x^2)
#> # Source:   lazy query [?? x 2]
#> # Database: postgres [data_user@localhost:/datawarehouse]
#>       x   sqr
#>   <int> <dbl>
#> 1     1     1
# mean() becomes avg(); without `na.rm = TRUE` a warning recalls that SQL
# always removes missing values.
translate_sql(mean(x))
#> Warning: Missing values are always removed in SQL.
#> Use `avg(x, na.rm = TRUE)` to silence this warning
#> <SQL> avg("x") OVER ()
# Arguments with no SQL counterpart (here `trim`) are rejected.
translate_sql(mean(x, trim = 0.05))
#> Error in mean(x, trim = 0.05): unused argument (trim = 0.05)
# Stating `na.rm = TRUE` silences the warning.
translate_sql(mean(x, na.rm = TRUE))
# Bring the remote result into R; the collected object is an ordinary tibble.
ratings_summary <- ratings_summary_db %>% 
  collect()
#> Warning: Missing values are always removed in SQL.
#> Use `AVG(x, na.rm = TRUE)` to silence this warning
class(ratings_summary)
#> [1] "tbl_df"     "tbl"        "data.frame"
# head() works on the lazy query and is still reported as a lazy query.
ratings_summary_db %>% head(1)
#> # Source:     lazy query [?? x 3]
#> # Database:   postgres [data_user@localhost:/datawarehouse]
#> # Ordered by: Year
#>    Year N               AvgRating
#>   <dbl> <S3: integer64>     <dbl>
#> 1  1936 1                       8
# Things a lazy query does not support: tail() raises an error ...
ratings_summary_db %>% tail()
#> Error: tail() is not supported by sql sources
# ... and nrow() returns NA.
ratings_summary_db %>% nrow()
#> [1] NA
# Count the rows with tally() instead; the count is computed by the database.
tally(ratings_summary_db)
#> Warning: Missing values are always removed in SQL.
#> Use `AVG(x, na.rm = TRUE)` to silence this warning
#> # Source:   lazy query [?? x 1]
#> # Database: postgres [data_user@localhost:/datawarehouse]
#>   n              
#>   <S3: integer64>
#> 1 66
# explain() prints the generated SQL followed by the database's query plan.
explain(ratings_summary_db)
#> Warning: Missing values are always removed in SQL.
#> Use `AVG(x, na.rm = TRUE)` to silence this warning
#> <SQL>
#> SELECT "Year", COUNT(*) AS "N", AVG("YourRating") AS "AvgRating"
#> FROM "ratings"
#> GROUP BY "Year"
#> ORDER BY "Year"
#> 
#> Warning: Missing values are always removed in SQL.
#> Use `AVG(x, na.rm = TRUE)` to silence this warning
#> <PLAN>
#> Sort  (cost=100.59..101.09 rows=200 width=24)
#>   Sort Key: ratings."Year"
#>   ->  HashAggregate  (cost=88.45..90.95 rows=200 width=24)
#>         Group Key: ratings."Year"
#>         ->  Seq Scan on ratings  (cost=0.00..75.40 rows=1740 width=16)
# Open an ODBC connection to the `datawarehouse` database on localhost:5432
# as `data_user`.
# NOTE(review): user name and password are hard-coded in the script; consider
# reading them from environment variables outside of a tutorial setting.
con <- dbConnect(
  drv = odbc(),
  driver = "PostgreSQL Unicode",
  server = "localhost",
  database = "datawarehouse",
  port = 5432,
  uid = "data_user",
  password = "data_pwd"
)
# A temporary copy does not show up in dbListTables() ...
copy_to(con, mtcars, "mtcars", temporary = TRUE, overwrite = TRUE)
dbListTables(con)
#> [1] "ratings"     "ratings2017" "ratings_idx"
# ... whereas a permanent copy does.
copy_to(con, mtcars, "mtcars", temporary = FALSE, overwrite = TRUE)
dbListTables(con)
#> [1] "mtcars"      "ratings"     "ratings2017" "ratings_idx"
# Same table written through DBI directly, replacing the existing one.
dbWriteTable(con, "mtcars", mtcars, overwrite = TRUE)
dbListTables(con)
#> [1] "mtcars"      "ratings"     "ratings2017" "ratings_idx"
# Replace the `mtcars` table with a version where the row names become a
# `car` column and a leading `const` column holds the first word of each name.
dbWriteTable(
  conn = con,
  name = "mtcars",
  value = mtcars %>%
    rownames_to_column(var = "car") %>%
    mutate(const = sub(" .*", "", car)) %>%
    select(const, everything()),
  overwrite = TRUE
)
library(nycflights13)
# Write five nycflights13 data frames to database tables of the same name,
# then list the tables visible through the connection.
dbWriteTable(con, "flights", flights)
dbWriteTable(con, "airports", airports)
dbWriteTable(con, "planes", planes)
dbWriteTable(con, "weather", weather)
dbWriteTable(con, "airlines", airlines)

dbListTables(con)
library(nycflights13)
# Copy every object exported by nycflights13 into a temporary database table
# and keep the resulting remote tables in a list named after the datasets.
nycflights13_db <- ls("package:nycflights13") %>%
  purrr::set_names() %>%
  map(function(table_name) {
    copy_to(
      dest = con,
      df = get(table_name),
      name = table_name,
      temporary = TRUE,
      overwrite = TRUE
    )
  })
dbListTables(con)
#> [1] "mtcars"      "ratings"     "ratings2017" "ratings_idx"
# Lazy query: join flights to planes on `tailnum`, total the air time per
# manufacturer and keep the three largest totals.
# The division by 60 presumably converts minutes to hours; confirm the unit
# of `air_time` in the nycflights13 documentation.
longest_air_time_db <- nycflights13_db[["flights"]] %>%
  inner_join(nycflights13_db[["planes"]], by = "tailnum") %>%
  group_by(manufacturer) %>% 
  summarise(total_air_time = sum(air_time)/60) %>% 
  arrange(desc(total_air_time)) %>%
  select(manufacturer, total_air_time) %>% 
  head(3)
# Printing the object displays the first rows of the result.
longest_air_time_db
#> # Source:     lazy query [?? x 2]
#> # Database:   postgres [data_user@localhost:/datawarehouse]
#> # Ordered by: desc(total_air_time)
#>   manufacturer     total_air_time
#>   <chr>                     <dbl>
#> 1 BOEING                  296118.
#> 2 AIRBUS                  153993.
#> 3 AIRBUS INDUSTRIE         96442.
# show_query() prints the SQL generated for the whole pipeline.
longest_air_time_db %>% show_query()
#> <SQL>
#> SELECT "manufacturer", "total_air_time"
#> FROM (SELECT *
#> FROM (SELECT "manufacturer", SUM("air_time") / 60.0 AS "total_air_time"
#> FROM (SELECT "TBL_LEFT"."year" AS "year.x", "TBL_LEFT"."month" AS "month", "TBL_LEFT"."day" AS "day", "TBL_LEFT"."dep_time" AS "dep_time", "TBL_LEFT"."sched_dep_time" AS "sched_dep_time", "TBL_LEFT"."dep_delay" AS "dep_delay", "TBL_LEFT"."arr_time" AS "arr_time", "TBL_LEFT"."sched_arr_time" AS "sched_arr_time", "TBL_LEFT"."arr_delay" AS "arr_delay", "TBL_LEFT"."carrier" AS "carrier", "TBL_LEFT"."flight" AS "flight", "TBL_LEFT"."tailnum" AS "tailnum", "TBL_LEFT"."origin" AS "origin", "TBL_LEFT"."dest" AS "dest", "TBL_LEFT"."air_time" AS "air_time", "TBL_LEFT"."distance" AS "distance", "TBL_LEFT"."hour" AS "hour", "TBL_LEFT"."minute" AS "minute", "TBL_LEFT"."time_hour" AS "time_hour", "TBL_RIGHT"."year" AS "year.y", "TBL_RIGHT"."type" AS "type", "TBL_RIGHT"."manufacturer" AS "manufacturer", "TBL_RIGHT"."model" AS "model", "TBL_RIGHT"."engines" AS "engines", "TBL_RIGHT"."seats" AS "seats", "TBL_RIGHT"."speed" AS "speed", "TBL_RIGHT"."engine" AS "engine"
#>   FROM "flights" AS "TBL_LEFT"
#>   INNER JOIN "planes" AS "TBL_RIGHT"
#>   ON ("TBL_LEFT"."tailnum" = "TBL_RIGHT"."tailnum")
#> ) "tiddcnycav"
#> GROUP BY "manufacturer") "lcgofypsma"
#> ORDER BY "total_air_time" DESC) "udepcvudse"
#> LIMIT 3
# Local computation: mean() is called without na.rm, and the printed result
# below shows NA for EWR.
nycflights13::weather %>% 
  group_by(origin, year) %>%  
  summarise(avg_temp = mean(temp))
#> # A tibble: 3 x 3
#> # Groups:   origin [?]
#>   origin  year avg_temp
#>   <chr>  <dbl>    <dbl>
#> 1 EWR     2013     NA  
#> 2 JFK     2013     54.5
#> 3 LGA     2013     55.8
# Same summary computed on the database table ...
nycflights13_db[["weather"]] %>% 
  group_by(origin, year) %>% 
  summarise(avg_temp = mean(temp))
# ... on the local data frame ...
nycflights13::weather %>% 
  group_by(origin, year) %>%  
  summarise(avg_temp = mean(temp))
# ... and on the database again: in the printed database result EWR gets a
# numeric average, whereas the local result shown earlier is NA.
nycflights13_db[["weather"]] %>% 
  group_by(origin, year) %>% 
  summarise(avg_temp = mean(temp))
#> # Source:   lazy query [?? x 3]
#> # Database: postgres [data_user@localhost:/datawarehouse]
#> # Groups:   origin
#>   origin  year avg_temp
#>   <chr>  <dbl>    <dbl>
#> 1 JFK     2013     54.5
#> 2 EWR     2013     55.5
#> 3 LGA     2013     55.8
#> # ... with more rows
# Count the distinct destinations matching the pattern 'A%'.
# `%like%` is not defined in this script; presumably dbplyr forwards it to the
# SQL LIKE operator (confirm in the dbplyr translation documentation).
nycflights13_db[["flights"]] %>% 
  filter(dest %like% 'A%') %>% 
  summarise(count = n_distinct(dest))
#> # Source:   lazy query [?? x 1]
#> # Database: postgres [data_user@localhost:/datawarehouse]
#>   count          
#>   <S3: integer64>
#> 1 7
# `%|| '-' ||%` is parsed by R as a single custom infix operator between
# `origin` and `dest`; presumably dbplyr forwards the text between the percent
# signs to SQL, i.e. origin || '-' || dest. The output below shows the
# concatenated value (e.g. EWR-IAH).
nycflights13_db[["flights"]] %>%
  mutate(origin_dest = origin %|| '-' ||% dest) %>%
  select(origin, dest, origin_dest)
#> # Source:   lazy query [?? x 3]
#> # Database: postgres [data_user@localhost:/datawarehouse]
#>   origin dest  origin_dest
#>   <chr>  <chr> <chr>      
#> 1 EWR    IAH   EWR-IAH    
#> 2 LGA    IAH   LGA-IAH    
#> # ... with more rows
# Variant 1: CONCAT() is not an R function; presumably it is passed through to
# the database as-is.
nycflights13_db[["flights"]] %>%
  mutate(origin_dest = CONCAT(origin, "-", dest)) %>%
  select(origin, dest, origin_dest)
# Variant 2: R's paste(), presumably translated to SQL by dbplyr.
nycflights13_db[["flights"]] %>%
  mutate(origin_dest = paste(origin, dest, sep = "-")) %>%
  select(origin, dest, origin_dest)
# Variant 3: custom infix operator carrying the SQL `||` operator.
nycflights13_db[["flights"]] %>%
  mutate(origin_dest = origin %|| '-' ||% dest) %>%
  select(origin, dest, origin_dest)
# Variant 1 again; its result is the output printed below.
nycflights13_db[["flights"]] %>%
  mutate(origin_dest = CONCAT(origin, "-", dest)) %>%
  select(origin, dest, origin_dest)
#> # Source:   lazy query [?? x 3]
#> # Database: postgres [data_user@localhost:/datawarehouse]
#>   origin dest  origin_dest
#>   <chr>  <chr> <chr>      
#> 1 EWR    IAH   EWR-IAH    
#> 2 LGA    IAH   LGA-IAH    
#> # ... with more rows
# The three concatenation variants once more (paste, custom infix, CONCAT),
# ending with paste(), whose result is the output printed below.
nycflights13_db[["flights"]] %>%
  mutate(origin_dest = paste(origin, dest, sep = "-")) %>%
  select(origin, dest, origin_dest)
nycflights13_db[["flights"]] %>%
  mutate(origin_dest = origin %|| '-' ||% dest) %>%
  select(origin, dest, origin_dest)
nycflights13_db[["flights"]] %>%
  mutate(origin_dest = CONCAT(origin, "-", dest)) %>%
  select(origin, dest, origin_dest)
nycflights13_db[["flights"]] %>%
  mutate(origin_dest = paste(origin, dest, sep = "-")) %>%
  select(origin, dest, origin_dest)
#> # Source:   lazy query [?? x 3]
#> # Database: postgres [data_user@localhost:/datawarehouse]
#>   origin dest  origin_dest
#>   <chr>  <chr> <chr>      
#> 1 EWR    IAH   EWR-IAH    
#> 2 LGA    IAH   LGA-IAH    
#> # ... with more rows
# Create a dedicated schema and write `flights` into it.
# dbExecute() is used because CREATE SCHEMA returns no rows: dbSendQuery()
# would hand back a result set that this script never clears.
dbExecute(con, "CREATE SCHEMA nycflights13;")
# SQL() marks the qualified name as literal SQL so it is used as
# schema.table rather than quoted as one identifier.
dbWriteTable(con, SQL("nycflights13.flights"), flights)
# Reference a table by its bare name (no schema given).
tbl(con, "mtcars") %>% head()
#> # Source:   lazy query [?? x 13]
#> # Database: postgres [data_user@localhost:/datawarehouse]
#>   const  car     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Mazda  Mazd…  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2 Mazda  Mazd…  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3 Datsun Dats…  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> 4 Hornet Horn…  21.4     6   258   110  3.08  3.22  19.4     1     0     3     1
#> 5 Hornet Horn…  18.7     8   360   175  3.15  3.44  17.0     0     0     3     2
#> # ... with more rows
# Same table, referenced explicitly through the `public` schema; the output is
# identical to the unqualified call.
tbl(con, in_schema("public", "mtcars")) %>% head()
#> # Source:   lazy query [?? x 13]
#> # Database: postgres [data_user@localhost:/datawarehouse]
#>   const  car     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Mazda  Mazd…  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2 Mazda  Mazd…  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3 Datsun Dats…  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> 4 Hornet Horn…  21.4     6   258   110  3.08  3.22  19.4     1     0     3     1
#> 5 Hornet Horn…  18.7     8   360   175  3.15  3.44  17.0     0     0     3     2
#> # ... with more rows
# Read from the `nycflights13` schema.
tbl(con, in_schema("nycflights13", "flights")) %>% head(1)
# NOTE(review): the visible part of this script only writes `flights` into the
# `nycflights13` schema, so this call presumably fails unless `weather` was
# created there elsewhere; confirm whether the failure is the intended demo.
tbl(con, in_schema("nycflights13", "weather")) %>% head(1)
library(xml2)
library(rvest)
library(DBI)
library(odbc)
library(tidyverse)
tidyverse_conflicts()
# NOTE(review): the two calls below have no arguments and `imdb_xml` is not
# defined in the visible part of this script; they look like slide
# placeholders naming the functions rather than runnable statements.
read_html()
html_nodes()
# Text of every <h1> element of a parsed page.
imdb_xml %>% 
  html_nodes("h1") %>% 
  html_text()
# Full-credits page of IMDb title tt1677720.
rpo_cast_url <- "https://www.imdb.com/title/tt1677720/fullcredits/"
# Download the page and keep the <table class="cast_list"> node(s).
rpo_cast_list <- read_html(rpo_cast_url) %>% 
  html_nodes("table.cast_list")
# Cells holding the character names.
rpo_character_list <- rpo_cast_list %>% 
  html_nodes("td.character")
# Raw text: still padded with line breaks and spaces (see output below).
html_text(rpo_character_list) %>% head(2)
#> [1] "\n            Parzival /  \n            Wade \n                  \n          "   
#> [2] "\n            Art3mis /  \n            Samantha \n                  \n          "
# Clean the character names in three steps: drop line breaks, collapse runs of
# spaces into one, then strip the single leading and trailing space.
# The last pattern only matches when BOTH a leading and a trailing space are
# present.
rpo_character_list %>% 
  html_text() %>% 
  gsub("\n", "", .) %>% 
  gsub(" +", " ", .) %>% 
  gsub("^ (.*) $", "\\1", .) %>% 
  head(2)
#> [1] "Parzival / Wade"    "Art3mis / Samantha"
# Turn the first cast table into a two-column tibble (Actor, Character),
# tidy the character names and drop separator / empty rows.
cast_tbl <- html_table(rpo_cast_list)[[1]] %>%
  select(2, 4) %>%
  as_tibble() %>%
  rename(Actor = X2, Character = X4) %>%
  mutate(
    # Line breaks out, runs of spaces collapsed, then the surrounding single
    # spaces removed (only when both are present).
    Character = gsub("\n", "", Character),
    Character = gsub(" +", " ", Character),
    Character = gsub("^ (.*) $", "\\1", Character)
  ) %>%
  filter(
    !grepl("Rest of cast listed alphabetically:", Actor),
    Actor != ""
  )

head(cast_tbl, 3)
#> # A tibble: 3 x 2
#>   Actor          Character         
#>   <chr>          <chr>             
#> 1 Tye Sheridan   Parzival / Wade   
#> 2 Olivia Cooke   Art3mis / Samantha
#> 3 Ben Mendelsohn Sorrento
# Import the exported ratings, remove the spaces from the column names and
# derive three date components from `DateRated`.
ratings <- read_csv("materials/ratings.csv")
names(ratings) <- gsub(" ", "", names(ratings))
ratings <- mutate(
  ratings,
  Year = lubridate::year(DateRated),
  Month = lubridate::month(DateRated),
  # `Day` holds the value returned by wday(), not the day of the month.
  Day = lubridate::wday(DateRated)
)
# Bootstrap the `movies` database:
#   1. connect to the `postgres` database as user `postgres`;
#   2. create the `imdb_user` role and the `movies` database it owns;
#   3. reconnect to `movies` as `imdb_user`.
# NOTE(review): credentials are hard-coded, which is only acceptable for a
# local tutorial setup.
con <- dbConnect(
  drv = odbc(),
  driver = "PostgreSQL Unicode",
  server = "localhost",
  database = "postgres",
  port = 5432,
  uid = "postgres",
  password = "password"
)
# These statements return no rows, so dbExecute() is the right call:
# dbSendQuery() would leave a result set open that is never cleared.
dbExecute(con, SQL("CREATE ROLE imdb_user LOGIN password 'imdb_pwd';"))
dbExecute(con, SQL("CREATE DATABASE movies;"))
dbExecute(con, SQL("ALTER DATABASE movies OWNER TO imdb_user;"))
dbDisconnect(con)
con <- dbConnect(
  drv = odbc(),
  driver = "PostgreSQL Unicode",
  server = "localhost",
  database = "movies",
  port = 5432,
  uid = "imdb_user",
  password = "imdb_pwd"
)
# Upload the local `ratings` tibble as a permanent `ratings` table and keep a
# remote reference to it.
# The argument is `name` (singular): the former `names =` was swallowed by
# `...`, so the table was only called "ratings" because that is also the name
# of the R object.
ratings_db <- copy_to(
  dest = con,
  df = ratings,
  name = "ratings",
  overwrite = TRUE,
  temporary = FALSE
)
# Look the title up in the database, build its full-credits URL and download
# the cast table node(s) of that page.
cast_list <- ratings_db %>%
  filter(Title %like% "Ready Player One") %>%
  collect() %>%
  pull(URL) %>%
  paste0("fullcredits/") %>%
  read_html() %>%
  html_nodes(css = "table.cast_list")
# Build the cast table from `cast_list`, i.e. the credits page whose URL was
# looked up in the database, instead of the hard-coded `rpo_cast_list`
# (which left `cast_list` unused).
cast_tbl <- cast_list %>%
  html_table() %>%
  .[[1]] %>%
  # Columns 2 and 4 hold the actor and the character.
  select(2, 4) %>%
  as_tibble() %>%
  rename(Actor = X2, Character = X4) %>%
  mutate(
    # Drop line breaks, collapse runs of spaces, strip the surrounding spaces.
    Character = Character %>%
      gsub("\n", "", .) %>%
      gsub(" +", " ", .) %>%
      gsub("^ (.*) $", "\\1", .)
  ) %>%
  # Remove the separator row and rows without an actor name.
  filter(!grepl("Rest of cast listed alphabetically:", Actor)) %>%
  filter(Actor != "")
#' Scrape the cast table of an IMDb title page.
#'
#' @param x URL of an IMDb title page (a single string), with or without a
#'   trailing slash, e.g. "https://www.imdb.com/title/tt0100240/".
#' @return A tibble with columns `Actor`, `Character` and `url` (the value of
#'   `x` as supplied), one row per row of the first cast table once the
#'   separator row and the rows with an empty actor name are removed.
get_cast <- function(x) {
  # Append "fullcredits/" whether or not `x` already ends with a slash.
  credits_url <- sub("/?$", "/fullcredits/", x)

  credits_url %>%
    read_html() %>%
    html_nodes("table.cast_list") %>%
    html_table() %>%
    .[[1]] %>%
    # Columns 2 and 4 hold the actor and the character.
    select(2, 4) %>%
    as_tibble() %>%
    rename(Actor = X2, Character = X4) %>%
    mutate(
      # Drop line breaks, collapse runs of spaces, then trim both ends;
      # trimws() also copes with padding on one side only.
      Character = Character %>%
        gsub("\n", "", .) %>%
        gsub(" +", " ", .) %>%
        trimws()
    ) %>%
    filter(!grepl("Rest of cast listed alphabetically:", Actor)) %>%
    filter(Actor != "") %>%
    mutate(url = x)
}
# Example call on a single title page; the result carries the source URL in
# its `url` column.
get_cast("https://www.imdb.com/title/tt0100240/")
#> # A tibble: 36 x 3
#>   Actor             Character url                                  
#>   <chr>             <chr>     <chr>                                
#> 1 Jonathan Brandis  Bastian   https://www.imdb.com/title/tt0100240/
#> 2 Kenny Morrison    Atreyu    https://www.imdb.com/title/tt0100240/
#> 3 Clarissa Burt     Xayide    https://www.imdb.com/title/tt0100240/
#> 4 John Wesley Shipp Barney    https://www.imdb.com/title/tt0100240/
#> 5 Martin Umbach     Nimbly    https://www.imdb.com/title/tt0100240/
#> # ... with 31 more rows
# Download the URL column, keep its first six rows and attach to each URL the
# cast tibble returned by get_cast() as a list-column.
casting <- ratings_db %>%
  select(URL) %>%
  collect() %>%
  head(n = 6L) %>%
  mutate(Casting = map(.x = URL, .f = get_cast))
casting
#> # A tibble: 6 x 2
#>   URL                                   Casting           
#>   <chr>                                 <list>            
#> 1 https://www.imdb.com/title/tt1001508/ <tibble [121 × 3]>
#> 2 https://www.imdb.com/title/tt1001526/ <tibble [19 × 3]> 
#> 3 https://www.imdb.com/title/tt0100240/ <tibble [36 × 3]> 
#> 4 https://www.imdb.com/title/tt1002966/ <tibble [43 × 3]> 
#> 5 https://www.imdb.com/title/tt0100403/ <tibble [67 × 3]> 
#> # ... with 1 more row
# Download the ratings page and keep one node per rated movie.
movies_list <- "https://www.imdb.com/user/ur56341222/ratings" %>%
  read_html() %>%
  html_nodes("div#ratings-container.lister-list") %>%
  html_nodes("div.lister-item.mode-detail")
# Movie titles: header text without line breaks, list rank and padding.
title <- movies_list %>%
  html_nodes("h3.lister-item-header") %>%
  html_text() %>%
  gsub("\n", "", .) %>%
  # Remove only the leading rank ("12."). The previous pattern " [0-9]+."
  # had an unescaped dot and was applied globally, so it also removed numbers
  # inside titles (e.g. " 12 " in "12 Angry Men").
  sub("^ *[0-9,]+\\.", "", .) %>%
  gsub(" +", " ", .) %>%
  # trimws() also handles padding that is present on one side only.
  trimws()
# Download the ratings page again and keep one node per rated movie
# (same statement as for the titles).
movies_list <- "https://www.imdb.com/user/ur56341222/ratings" %>% 
  read_html() %>% 
  html_nodes("div#ratings-container.lister-list") %>% 
  html_nodes("div.lister-item.mode-detail")
# Genres: text of the `span.genre` nodes, cleaned of line breaks and padding.
genres <- movies_list %>% 
  html_nodes("span.genre") %>% 
  html_text() %>% 
  gsub("\n", "", .) %>% 
  gsub(" +", " ", .) %>% 
  gsub("^ (.*) $", "\\1", .)
# Same download once more before extracting the runtimes.
movies_list <- "https://www.imdb.com/user/ur56341222/ratings" %>% 
  read_html() %>% 
  html_nodes("div#ratings-container.lister-list") %>% 
  html_nodes("div.lister-item.mode-detail")
# Runtimes: text of the `span.runtime` nodes, cleaned the same way.
runtime <- movies_list %>% 
  html_nodes("span.runtime") %>% 
  html_text() %>% 
  gsub("\n", "", .) %>% 
  gsub(" +", " ", .) %>% 
  gsub("^ (.*) $", "\\1", .)
# Download the ratings page again and keep one node per rated movie.
movies_list <- "https://www.imdb.com/user/ur56341222/ratings" %>% 
  read_html() %>% 
  html_nodes("div#ratings-container.lister-list") %>% 
  html_nodes("div.lister-item.mode-detail")
# Ratings: text of the small rating-star blocks, de-duplicated, cleaned and
# converted to numbers.
# NOTE(review): the string given to html_text() is passed as its second
# argument, which is presumably `trim` rather than a CSS selector, so it
# looks like it has no selecting effect; confirm the intent.
rating <- movies_list %>% 
  html_nodes("div.ipl-rating-widget") %>% 
  html_nodes("div.ipl-rating-star.small") %>% 
  html_text("span.ipl-rating-star__rating") %>% 
  unique() %>% 
  gsub("\n", "", .) %>% 
  gsub(" +", " ", .) %>% 
  gsub("^ (.*) $", "\\1", .) %>% 
  as.numeric() %>% 
  # Keep the result only when exactly two values were found; otherwise NA, NA.
  (function(.x) {if (length(.x)!=2) {c(NA, NA)} else {.x}})
#' Extract the main fields of one rated movie from an IMDb ratings list.
#'
#' @param x A single movie node of the ratings page (one element of the
#'   `div.lister-item.mode-detail` node set).
#' @return A one-row data frame with columns `title`, `genres`, `runtime`,
#'   `imdbrating` and `userating`. All fields travel through one character
#'   vector, so the ratings are stored as text rather than numbers. A text
#'   field missing from the page is `NA`; both ratings are `NA` unless exactly
#'   two rating values are found.
get_movies_info <- function(x) {
  # Drop line breaks, collapse runs of spaces and trim both ends.
  clean_text <- function(txt) {
    trimws(gsub(" +", " ", gsub("\n", "", txt)))
  }
  # Raw text of the nodes matching `css` inside `x`.
  node_text <- function(css) {
    x %>% html_nodes(css) %>% html_text()
  }

  # Remove only the leading list rank ("12."). The previous pattern
  # " [0-9]+." had an unescaped dot and was applied globally, so it also
  # removed numbers inside titles (e.g. " 12 " in "12 Angry Men").
  title <- node_text("h3.lister-item-header") %>%
    gsub("\n", "", .) %>%
    sub("^ *[0-9,]+\\.", "", .) %>%
    clean_text()
  genres <- clean_text(node_text("span.genre"))
  runtime <- clean_text(node_text("span.runtime"))

  # The selector string formerly passed to html_text() landed in its second
  # argument instead of selecting nodes; it is dropped here.
  # NOTE(review): confirm that dropping it leaves the extracted text unchanged.
  ratings <- x %>%
    html_nodes("div.ipl-rating-widget") %>%
    html_nodes("div.ipl-rating-star.small") %>%
    html_text() %>%
    clean_text() %>%
    as.numeric()
  if (length(ratings) != 2) {
    ratings <- c(NA, NA)
  }

  # `[1]` yields NA for a missing field, which keeps the vector at exactly
  # five elements so the column names always line up.
  out <- c(
    title = title[1],
    genres = genres[1],
    runtime = runtime[1],
    imdbrating = ratings[1],
    userating = ratings[2]
  )
  data.frame(t(as.matrix(out)))
}
# Download the ratings page, split it into one node per rated movie and
# row-bind the one-row data frames returned by get_movies_info().
movies_list <- "https://www.imdb.com/user/ur56341222/ratings" %>% 
  read_html() %>% 
  html_nodes("div#ratings-container.lister-list") %>% 
  html_nodes("div.lister-item.mode-detail") %>% 
  map_df(.f = get_movies_info)
# Plot comparing the distribution of the user's ratings with IMDb's ratings:
# a dodged histogram of rounded ratings with a smoothed density on top.
p <- read_csv("materials/ratings.csv") %>%
  select(`Your Rating`, `IMDb Rating`) %>%
  # Long format: one row per (source, rating) pair.
  gather(data = ., key = Who, value = Rating) %>%
  ggplot(aes(x = round(Rating, digits = 0), fill = Who)) +
    geom_density(
      # Uses the unrounded rating; the share is scaled by 100.
      aes(x = Rating, y = (..count../sum(..count..))*100),
      bw = 1,
      alpha = 0.75,
      colour = "white"
    ) +
    geom_histogram(
      aes(y = ..count../sum(..count..)),
      binwidth = 0.5,
      colour = "white",
      position = position_dodge2()
    ) +
    scale_x_continuous(
      name = "Rating",
      expand = c(0, 0),
      limits = c(0, 10),
      breaks = c(0, seq_len(10))
    ) +
    scale_y_continuous(
      expand = expand_scale(mult = c(0, 0.05)),
      # Namespaced: `percent` comes from scales, which is not attached by
      # library(tidyverse), so the bare name may not be found.
      labels = scales::percent
    ) +
    scale_fill_viridis_d(name = NULL) +
    labs(x = "Rating", y = "Proportion", title = "Distribution of Ratings") +
    theme(legend.position = c(0, 1), legend.justification = c(-0.05, 1.05))
# Download and run the R code stored in a GitHub gist.
# NOTE(review): this executes remote code at run time; review the gist (and
# ideally pin a revision) before running.
devtools::source_gist("https://gist.github.com/mcanouil/2bb6df3817b01049d4f2a4541de5e6de")

Type de données	Package	Alternatives
Fichier "Texte"	*readr*	base, data.table
Excel	*readxl*	gdata, openxlsx, XLConnect, xlsx
Logiciel Statistique	*haven*	foreign, sas7bdat, readstata13

Les Bases de Données et R

Préparer sa session

Trucs et astuces

Le tidyverse

Qu'est-ce que le tidyverse ?

Qu'est-ce que le tidyverse ?

tibble

comme un data.frame, mais en mieux !

Pré-requis

Construire un tibble

Construire un tibble

Afficher un data.frame

Afficher un tibble

Afficher un tibble

Afficher un tibble : les options

Sélectionner une variable : $, [ et [[

Sélectionner une variable : $, [ et [[

Assurer la rétro-compatibilité du code

Exercices (Partie I)

Exercices (Partie I)

Exercices (Partie I)

Exercices (Partie I)

Exercices (Partie II)

Exercices (Partie II)

Exercices (Partie II)

Pour aller plus loin …

%>%

magrittr : "Ceci n'est pas un pipe."

Pré-requis

Qu'est-ce que le "pipe" ?

Pourquoi utiliser le "pipe" ?

Conserver les étapes

Remplacer l'objet original

Composer l'appel aux fonctions

Utiliser le "pipe"

Pour aller plus loin …

readr & readxl

Importer des données

Pré-requis

Les fonctions de base de readr

Par rapport à celles de R : Entrée

Par rapport à celles de R : Sortie

Dans la pratique

Lire un CSV avec base

Dans la pratique

Lire un CSV avec readr

Dans la pratique

Lire un "texte" avec base

Dans la pratique

Lire un "texte" avec readr

Les arguments de readr

Les arguments de readr

Les arguments de readr

Exercices

Exercices

Exercices

Exercices

Les alternatives à readr

Quelques benchmarks !

Quelques benchmarks : Lecture

Quelques benchmarks : Écriture

Exporter des données vers un fichier

dplyr

Une grammaire de la manipulation des données

dplyr : Le pipe et des fonctions utiles

Pré-requis

Les fonctions de base

Les fonctions de base

Sélectionner des lignes

Exercices

Exercices

Exercices

Exercices

Exercices

Réordonner des lignes

Exercices

Exercices

Sélectionner des colonnes

Exercices

Exercices

Sélectionner une variable : `$`, `[` et `[[`

Sélectionner une variable : `$`, `[` et `[[`

Utiliser `group_by()`

dplyr > SQL > tbl ? > `collect()` > tbl

dplyr > SQL > tbl ? > `collect()` > tbl

dplyr, `explain()` to me