Emilio Berti
23/09/2022
My running data (kalenji.csv): date, distance (km), and time of each run.
## date km time
## 1 10/27/2021 6.0 0:34:28
## 2 10/27/2021 6.0 0:33:24
## 3 10/31/2021 10.3 0:59:19
## 4 11/6/2021 8.7 0:54:00
## 5 11/13/2021 8.7 0:49:00
## 6 11/17/2021 4.6 0:25:28
## 7 11/21/2021 10.4 1:02:00
## 8 11/30/2021 6.0 0:34:08
## 9 11/30/2021 6.0 0:33:03
## 10 12/5/2021 6.2 0:35:00
## 11 12/9/2021 6.2 0:37:33
## 12 12/19/2021 6.2 0:35:22
## 13 12/22/2021 6.2 0:36:00
## 14 12/28/2021 6.2 0:35:00
## 15 12/31/2021 6.2 0:33:12
## 16 1/2/2022 6.2 0:33:43
## 17 1/12/2022 6.0 0:33:44
## 18 1/12/2022 6.0 0:33:45
## 19 1/16/2022 6.2 0:33:52
## 20 1/19/2022 6.0 0:33:38
## 21 1/19/2022 6.0 0:31:56
## 22 1/23/2022 6.2 0:34:26
## 23 1/26/2022 6.0 0:33:34
## 24 1/26/2022 6.0 0:32:19
## 25 1/30/2022 6.2 0:32:44
## 26 2/2/2022 6.0 0:33:59
## 27 2/2/2022 6.0 0:33:41
## 28 2/6/2022 6.2 0:34:35
## 29 2/9/2022 6.0 0:32:33
## 30 2/9/2022 6.0 0:32:20
## 31 2/12/2022 8.7 0:48:35
## 32 2/16/2022 6.2 0:34:02
## 33 2/19/2022 8.7 0:46:40
## 34 2/23/2022 6.0 0:34:03
## 35 2/23/2022 6.0 0:30:40
## 36 2/28/2022 8.1 0:43:18
## 37 3/2/2022 6.4 0:36:16
## 38 3/3/2022 6.4 0:38:22
## 39 3/9/2022 6.2 0:30:33
## 40 3/12/2022 8.7 0:45:20
## 41 3/16/2022 6.0 0:29:48
## 42 3/16/2022 6.0 0:29:48
## 43 3/19/2022 6.2 0:33:39
## 44 3/21/2022 6.0 0:30:57
## 45 3/21/2022 6.0 0:28:40
## 46 3/27/2022 8.7 0:47:03
## 47 3/30/2022 6.0 0:32:03
## 48 3/30/2022 6.0 0:30:03
## 49 4/3/2022 6.2 0:34:06
## 50 4/7/2022 6.0 0:29:59
## 51 4/7/2022 6.0 0:32:22
## # A tibble: 51 × 3
## date km time
## <chr> <dbl> <chr>
## 1 10/27/2021 6 0:34:28
## 2 10/27/2021 6 0:33:24
## 3 10/31/2021 10.3 0:59:19
## 4 11/6/2021 8.7 0:54:00
## 5 11/13/2021 8.7 0:49:00
## 6 11/17/2021 4.6 0:25:28
## 7 11/21/2021 10.4 1:02:00
## 8 11/30/2021 6 0:34:08
## 9 11/30/2021 6 0:33:03
## 10 12/5/2021 6.2 0:35:00
## # … with 41 more rows
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.6 4.6 4.6 4.6 4.6 4.6
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.6 4.6 4.6 4.6 4.6 4.6
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.6 4.6 4.6 4.6 4.6 4.6
The same query you would write in SQL can be expressed as a dplyr pipeline with %>%: total distance and best time for each route, computed on the fly.
read_csv("kalenji.csv", show_col_types = FALSE) %>%
transmute(
time = minute(time) + hour(time) * 60, #total minutes
Distance = round(km) #round distance
) %>%
group_by(Distance) %>% #evaluate by group
summarize(
`Total distance` = sum(Distance),
`Fastest time` = min(time)
) %>%
arrange(desc(`Total distance`)) %>%
knitr::kable()
| Distance (km) | Total distance (km) | Fastest time (min) |
|---|---|---|
| 6 | 246 | 28 |
| 9 | 54 | 45 |
| 10 | 20 | 59 |
| 8 | 8 | 43 |
| 5 | 5 | 25 |
Goal: tabular data that keeps only the good features of data.frame.
Pros:
variable names can contain spaces and special characters (e.g. /, $, .)
Cons:
## date km time
## 1 10/27/2021 6.0 0:34:28
## 2 10/27/2021 6.0 0:33:24
## 3 10/31/2021 10.3 0:59:19
## 4 11/6/2021 8.7 0:54:00
## 5 11/13/2021 8.7 0:49:00
## 6 11/17/2021 4.6 0:25:28
## 7 11/21/2021 10.4 1:02:00
## 8 11/30/2021 6.0 0:34:08
## 9 11/30/2021 6.0 0:33:03
## 10 12/5/2021 6.2 0:35:00
## 11 12/9/2021 6.2 0:37:33
## 12 12/19/2021 6.2 0:35:22
## 13 12/22/2021 6.2 0:36:00
## 14 12/28/2021 6.2 0:35:00
## 15 12/31/2021 6.2 0:33:12
## 16 1/2/2022 6.2 0:33:43
## 17 1/12/2022 6.0 0:33:44
## 18 1/12/2022 6.0 0:33:45
## 19 1/16/2022 6.2 0:33:52
## 20 1/19/2022 6.0 0:33:38
## 21 1/19/2022 6.0 0:31:56
## 22 1/23/2022 6.2 0:34:26
## 23 1/26/2022 6.0 0:33:34
## 24 1/26/2022 6.0 0:32:19
## 25 1/30/2022 6.2 0:32:44
## 26 2/2/2022 6.0 0:33:59
## 27 2/2/2022 6.0 0:33:41
## 28 2/6/2022 6.2 0:34:35
## 29 2/9/2022 6.0 0:32:33
## 30 2/9/2022 6.0 0:32:20
## 31 2/12/2022 8.7 0:48:35
## 32 2/16/2022 6.2 0:34:02
## 33 2/19/2022 8.7 0:46:40
## 34 2/23/2022 6.0 0:34:03
## 35 2/23/2022 6.0 0:30:40
## 36 2/28/2022 8.1 0:43:18
## 37 3/2/2022 6.4 0:36:16
## 38 3/3/2022 6.4 0:38:22
## 39 3/9/2022 6.2 0:30:33
## 40 3/12/2022 8.7 0:45:20
## 41 3/16/2022 6.0 0:29:48
## 42 3/16/2022 6.0 0:29:48
## 43 3/19/2022 6.2 0:33:39
## 44 3/21/2022 6.0 0:30:57
## 45 3/21/2022 6.0 0:28:40
## 46 3/27/2022 8.7 0:47:03
## 47 3/30/2022 6.0 0:32:03
## 48 3/30/2022 6.0 0:30:03
## 49 4/3/2022 6.2 0:34:06
## 50 4/7/2022 6.0 0:29:59
## 51 4/7/2022 6.0 0:32:22
## # A tibble: 51 × 3
## date km time
## <chr> <dbl> <chr>
## 1 10/27/2021 6 0:34:28
## 2 10/27/2021 6 0:33:24
## 3 10/31/2021 10.3 0:59:19
## 4 11/6/2021 8.7 0:54:00
## 5 11/13/2021 8.7 0:49:00
## 6 11/17/2021 4.6 0:25:28
## 7 11/21/2021 10.4 1:02:00
## 8 11/30/2021 6 0:34:08
## 9 11/30/2021 6 0:33:03
## 10 12/5/2021 6.2 0:35:00
## # … with 41 more rows
## # A tibble: 50 × 2
## x y
## <int> <int>
## 1 1 51
## 2 2 52
## 3 3 53
## 4 4 54
## 5 5 55
## 6 6 56
## 7 7 57
## 8 8 58
## 9 9 59
## 10 10 60
## # … with 40 more rows
## # A tibble: 10 × 2
## x y
## <int> <int>
## 1 1 1
## 2 2 2
## 3 3 3
## 4 4 4
## 5 5 5
## 6 6 6
## 7 7 7
## 8 8 8
## 9 9 9
## 10 10 10
## # A tibble: 10 × 2
## x y
## <chr> <int>
## 1 A 1
## 2 B 2
## 3 C 3
## 4 D 4
## 5 E 5
## 6 F 6
## 7 G 7
## 8 H 8
## 9 I 9
## 10 J 10
## Warning: Unknown or uninitialised column: `ID`.
## <error/vctrs_error_subscript_oob>
## Error in `vectbl_as_col_location()`:
## ! Can't subset columns past the end.
## ℹ Location 2 doesn't exist.
## ℹ There is only 1 column.
## ---
## Backtrace:
## 1. base::tryCatch(moveHMM::prepData(as_tibble(l)), error = function(e) print(e))
## 5. moveHMM::prepData(as_tibble(l))
## 7. tibble:::`[.tbl_df`(x, i)
## 8. tibble:::vectbl_as_col_location(j, length(x), names(x), j_arg = j_arg, assign = FALSE)
## ID step angle x y
## 1 Animal1 156.8744 NA 1 1
## 2 Animal1 156.8276 0.0004610422 2 2
## 3 Animal1 156.7574 0.0006155637 3 3
## 4 Animal1 156.6639 0.0007703128 4 4
## 5 Animal1 156.5472 0.0009253809 5 5
## 6 Animal1 156.4073 0.0010808595 6 6
## 7 Animal1 156.2444 0.0012368401 7 7
## 8 Animal1 156.0586 0.0013934143 8 8
## 9 Animal1 155.8500 0.0015506736 9 9
## 10 Animal1 NA NA 10 10
## # A tibble: 10 × 2
## x y
## <int> <int>
## 1 1 1
## 2 2 2
## 3 3 3
## 4 4 4
## # … with 6 more rows
Columns can also be filled on the fly, e.g. with random numbers from runif():
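A minimal sketch (the random values printed below will differ):
tibble(x = runif(10), y = runif(10)) # two columns of uniform random numbers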
## # A tibble: 10 × 2
## x y
## <dbl> <dbl>
## 1 0.771 0.905
## 2 0.220 0.745
## 3 0.394 0.0169
## 4 0.0518 0.622
## # … with 6 more rows
With data.frame, a shorter random column is silently recycled: below, y has length 5 and is repeated to fill 10 rows.
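A sketch of such a call (random values will differ):
data.frame(x = runif(10), y = runif(5)) # y is recycled to length 10 without a warning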
## x y
## 1 0.8002872 0.7593188
## 2 0.3531292 0.4823047
## 3 0.5712468 0.2036041
## 4 0.2484056 0.9589408
## 5 0.1329216 0.7326067
## 6 0.6193830 0.7593188
## 7 0.3570391 0.4823047
## 8 0.3006154 0.2036041
## 9 0.7815725 0.9589408
## 10 0.5639128 0.7326067
tibble refuses to recycle anything longer than one value and throws an error instead:
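For instance, a hypothetical call matching the error below:
tibble(x = runif(10), y = runif(5)) # error: sizes 10 and 5 are incompatible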
## <error/tibble_error_incompatible_size>
## Error:
## ! Tibble columns must have compatible sizes.
## • Size 10: Existing data.
## • Size 5: Column at position 2.
## ℹ Only values of size one are recycled.
## ---
## Backtrace:
## 1. base::tryCatch(...)
## 5. tibble::tibble(x, y)
## 6. tibble:::tibble_quos(xs, .rows, .name_repair)
## 7. tibble:::vectbl_recycle_rows(res, first_size, j, given_col_names[[j]])
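Length-one values, on the other hand, are recycled without complaint; a sketch consistent with the output below:
tibble(x = runif(10), y = 0) # the single 0 is recycled to all 10 rows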
## # A tibble: 10 × 2
## x y
## <dbl> <dbl>
## 1 0.258 0
## 2 0.810 0
## 3 0.435 0
## 4 0.979 0
## # … with 6 more rows
Goal: load tabular data from delimited files (comma-separated).
Pros:
Cons:
## 'data.frame': 51 obs. of 3 variables:
## $ date: chr "10/27/2021" "10/27/2021" "10/31/2021" "11/6/2021" ...
## $ km : num 6 6 10.3 8.7 8.7 4.6 10.4 6 6 6.2 ...
## $ time: chr "0:34:28" "0:33:24" "0:59:19" "0:54:00" ...
## Rows: 51 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): date
## dbl (1): km
## time (1): time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
readr loads the data directly as a tibble:
## # A tibble: 51 × 3
## date km time
## <chr> <dbl> <time>
## 1 10/27/2021 6 34'28"
## 2 10/27/2021 6 33'24"
## 3 10/31/2021 10.3 59'19"
## 4 11/6/2021 8.7 54'00"
## # … with 47 more rows
readr is smart: the time column was parsed as a real time, so you can do arithmetic with it, e.g. the difference between the first two runs:
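A minimal check consistent with the output below (assuming d is the tibble just read with read_csv):
d <- read_csv("kalenji.csv", show_col_types = FALSE)
d$time[2] - d$time[1] # subtracting two parsed times gives a difftime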
## Time difference of -64 secs
readr also handles other delimiters, e.g. files that use ; as separator (read_csv2()).
For a 24 MB table with 29,000 rows, reading times compare as follows:
|   | test | replications | elapsed | relative |
|---|---|---|---|---|
| 2 | base | 10 | 4.942 | 5.291 |
| 3 | data.table | 10 | 0.934 | 1.000 |
| 1 | readr | 10 | 3.457 | 3.701 |
Before going deeper: magrittr pipe %>%.
command_1 PIPE command_2 PIPE command_3
PIPE = take output from left and pass it as input to right.
EXECUTE command_1 AND PASS ITS OUTPUT TO command_2 AND PASS ITS OUTPUT TO command_3.
round(mean(abs(x))) = x %>% abs() %>% mean() %>% round()
For example, you can generate random numbers with runif(10) and pipe them straight into a plot, without saving any intermediate object in RAM.
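A minimal sketch of that idea (assuming the tidyverse is loaded; the plot itself is not shown here):
runif(10) %>% # generate 10 random numbers
  sort() %>%  # sort them
  plot(type = "b", ylab = "sorted values") # plot directly; nothing is saved to the workspace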
Goal: Standardize data manipulation.
Pros:
Cons:
Common verbs:
filter(): retain rows conditionally.
select(): select columns and drop the rest.
mutate(): create a new column.
pull(): extract one column as a vector.
summarize(): summarize columns.
group_by(): group a table into sub-tables; each group is manipulated separately.
arrange(): sort rows by values.
slice(): select rows by ID, randomly, the first rows, etc.
join(): join tables.
filter() retains rows conditionally. For example, keep only the shortest run:
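A hypothetical call consistent with the single row printed below (the column selection is assumed):
d %>%
  select(km, time) %>%
  filter(km == min(km)) # only the 4.6 km run survives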
## # A tibble: 1 × 2
## km time
## <dbl> <time>
## 1 4.6 25'28"
filter() retains rows conditionally. For example, keep only the runs whose distance is an even number of km (km %% 2 == 0):
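A hypothetical call consistent with the output below:
d %>%
  select(km, time) %>%
  filter(km %% 2 == 0) # 6.0 km runs pass; 6.2, 8.7, ... do not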
## # A tibble: 24 × 2
## km time
## <dbl> <time>
## 1 6 34'28"
## 2 6 33'24"
## 3 6 34'08"
## 4 6 33'03"
## # … with 20 more rows
Conditions can also be more complex or combined; here only the roughly 6 km runs are kept:
## # A tibble: 41 × 2
## km time
## <dbl> <time>
## 1 6 34'28"
## 2 6 33'24"
## 3 6 34'08"
## 4 6 33'03"
## # … with 37 more rows
select() selects some columns and drops the rest.
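For example, a call consistent with the output below:
d %>% select(km) # keep only the km column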
## # A tibble: 51 × 1
## km
## <dbl>
## 1 6
## 2 6
## 3 10.3
## 4 8.7
## # … with 47 more rows
You can also drop columns by prefixing their names with -.
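For example, dropping date and km leaves only time (hypothetical call, consistent with the output below):
d %>% select(-date, -km)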
## # A tibble: 51 × 1
## time
## <time>
## 1 34'28"
## 2 33'24"
## 3 59'19"
## 4 54'00"
## # … with 47 more rows
What if we want to select several columns conditionally?
tidysel <- tibble(
Outcome = runif(10, 2, 3),
`Predictor 1` = runif(10, -1, 1),
`Predictor 2` = runif(10, 10, 150),
`First levels` = rep(c("A", "B"), 5),
`Second levels` = rep(c("C", "D"), 5)
)
tidysel
## # A tibble: 10 × 5
## Outcome `Predictor 1` `Predictor 2` `First levels` `Second levels`
## <dbl> <dbl> <dbl> <chr> <chr>
## 1 2.37 0.328 72.2 A C
## 2 2.78 -0.619 87.7 B D
## 3 2.12 -0.924 133. A C
## 4 2.66 0.282 64.6 B D
## # … with 6 more rows
tidyselect allows this flexibility.
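Calls along these lines (hypothetical) would produce the two selections printed below:
tidysel %>% select(contains("Predictor")) # columns whose name contains "Predictor"
tidysel %>% select(ends_with("levels"))   # columns whose name ends with "levels"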
## # A tibble: 10 × 2
## `Predictor 1` `Predictor 2`
## <dbl> <dbl>
## 1 0.328 72.2
## 2 -0.619 87.7
## 3 -0.924 133.
## 4 0.282 64.6
## # … with 6 more rows
## # A tibble: 10 × 2
## `First levels` `Second levels`
## <chr> <chr>
## 1 A C
## 2 B D
## 3 A C
## 4 B D
## # … with 6 more rows
For this, we also need the where() helper, which checks conditions on columns. Conditions must return TRUE or FALSE, e.g. is.numeric.
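For example, keeping only the numeric columns (consistent with the output below):
tidysel %>% select(where(is.numeric))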
## # A tibble: 10 × 3
## Outcome `Predictor 1` `Predictor 2`
## <dbl> <dbl> <dbl>
## 1 2.37 0.328 72.2
## 2 2.78 -0.619 87.7
## 3 2.12 -0.924 133.
## 4 2.66 0.282 64.6
## # … with 6 more rows
where
is extremely useful for many verbs in
dplyr. We will see more cases later.
You can already appreciate its simplicity.
Application: PCAs on climate data
In base R there are several ways to do the same thing: some are not safe, some are not scalable, and only one is both safe and scalable besides the tidyverse one.
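An illustrative sketch of what those base-R alternatives could look like (not the original slide code; the labels are one plausible reading of the three points above):
tidysel[, 1:3]                                        # not safe: relies on column positions
tidysel[, c("Outcome", "Predictor 1", "Predictor 2")] # not scalable: every name hard-coded
tidysel[, sapply(tidysel, is.numeric)]                # safe and scalable, base-R style
tidysel %>% select(where(is.numeric))                 # the tidyverse equivalent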
mutate() creates new columns (or modifies existing ones).
d <- read_csv("kalenji.csv", show_col_types = FALSE)
d <- d %>% mutate(Date = as_date(date, format = "%m/%d/%Y")) # %Y: four-digit year
d
## # A tibble: 51 × 4
## date km time Date
## <chr> <dbl> <time> <date>
## 1 10/27/2021 6 34'28" 2021-10-27
## 2 10/27/2021 6 33'24" 2021-10-27
## 3 10/31/2021 10.3 59'19" 2021-10-31
## 4 11/6/2021 8.7 54'00" 2021-11-06
## # … with 47 more rows
Can you compress this even more and make it more readable?
Do we need the old date column?
lubridate's month(), week(), and day() extract the month, week, and day of a date.
d <- d %>%
mutate(
month = month(date, label = TRUE, abbr = FALSE),
week = week(date),
day = day(date)
)
d
## # A tibble: 51 × 6
## date km time month week day
## <date> <dbl> <time> <ord> <dbl> <int>
## 1 2021-10-27 6 34'28" October 43 27
## 2 2021-10-27 6 33'24" October 43 27
## 3 2021-10-31 10.3 59'19" October 44 31
## 4 2021-11-06 8.7 54'00" November 45 6
## # … with 47 more rows
mutate()
can make use of tidyselect, but within
the across() helper.
The syntax always looks like: mutate(across(where( condition ), fn)).
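Hypothetical calls consistent with the two outputs below: round the predictor columns to one decimal, then lower-case the character columns:
tidysel %>% mutate(across(contains("Predictor"), ~ round(.x, 1)))
tidysel %>% mutate(across(where(is.character), tolower))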
## # A tibble: 10 × 5
## Outcome `Predictor 1` `Predictor 2` `First levels` `Second levels`
## <dbl> <dbl> <dbl> <chr> <chr>
## 1 2.37 0.3 72.2 A C
## 2 2.78 -0.6 87.7 B D
## 3 2.12 -0.9 133. A C
## 4 2.66 0.3 64.6 B D
## # … with 6 more rows
## # A tibble: 10 × 5
## Outcome `Predictor 1` `Predictor 2` `First levels` `Second levels`
## <dbl> <dbl> <dbl> <chr> <chr>
## 1 2.37 0.328 72.2 a c
## 2 2.78 -0.619 87.7 b d
## 3 2.12 -0.924 133. a c
## 4 2.66 0.282 64.6 b d
## # … with 6 more rows
pull() extracts one column as a vector.
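For example, a call consistent with the vector printed below:
d %>% pull(km)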
## [1] 6.0 6.0 10.3 8.7 8.7 4.6 10.4 6.0 6.0 6.2 6.2 6.2 6.2 6.2 6.2
## [16] 6.2 6.0 6.0 6.2 6.0 6.0 6.2 6.0 6.0 6.2 6.0 6.0 6.2 6.0 6.0
## [31] 8.7 6.2 8.7 6.0 6.0 8.1 6.4 6.4 6.2 8.7 6.0 6.0 6.2 6.0 6.0
## [46] 8.7 6.0 6.0 6.2 6.0 6.0
Example: pull the month column, make a table() of it, and sort it by count.
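A pipeline along these lines (hypothetical) would give the sorted counts below:
d %>%
  pull(month) %>% # extract month as a vector
  table() %>%     # count runs per month
  sort()          # sort by count, ascending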
## .
## May June July August September April October November
## 0 0 0 0 0 3 3 6
## December January February March
## 6 10 11 12
summarize() summarizes columns.
d %>% summarize(
`shortest distance` = min(km),
`average distance` = mean(km),
`maximum distance` = max(km)
)
## # A tibble: 1 × 3
## `shortest distance` `average distance` `maximum distance`
## <dbl> <dbl> <dbl>
## 1 4.6 6.58 10.4
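The next output (total distance and total time, apparently in hours) could come from a call like this (hypothetical; the original chunk is not shown):
d %>% summarize(
  `Total km`   = sum(km),
  `Total time` = sum(hour(time) + minute(time) / 60) # hours, assuming time is a parsed time
)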
## # A tibble: 1 × 2
## `Total km` `Total time`
## <dbl> <dbl>
## 1 335. 30.7
tidysel %>% summarize(
across(contains("Pred"), #tidyselect
list(min = min, max = max), #summary stats
.names = "{col} : {fn}") #names of columns
)
## # A tibble: 1 × 4
## `Predictor 1 : min` `Predictor 1 : max` `Predictor 2 : min` `Predictor 2 : m…`
## <dbl> <dbl> <dbl> <dbl>
## 1 -0.924 0.679 24.2 150.
d %>%
summarize(across(where(is.numeric),
list(mean = mean, median = median),
.names = "{fn} of {col}"))
## # A tibble: 1 × 6
## `mean of km` `median of km` `mean of week` `median of week` `mean of day`
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6.58 6.2 19.6 11 17.1
## # … with 1 more variable: `median of day` <int>
dplyr operations can also be performed on groups within tables by
using group_by()
.
For instance, the sum of all distances for the whole kalenji dataset is:
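A call consistent with the output below:
d %>% summarize(`Total distance` = sum(km))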
## # A tibble: 1 × 1
## `Total distance`
## <dbl>
## 1 335.
To get the summary sum for each month separately, we just need to specify month as the grouping variable. dplyr would then perform the next operation on each group separately:
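A hypothetical call consistent with the grouped output below:
d %>%
  group_by(month) %>%
  summarize(`Total distance` = sum(km))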
## # A tibble: 7 × 2
## month `Total distance`
## <ord> <dbl>
## 1 January 60.8
## 2 February 73.9
## 3 March 78.6
## 4 April 18.2
## # … with 3 more rows
## # A tibble: 7 × 2
## month n
## <ord> <int>
## 1 January 10
## 2 February 11
## 3 March 12
## 4 April 3
## # … with 3 more rows
## # A tibble: 7 × 2
## month n
## <ord> <int>
## 1 January 10
## 2 February 11
## 3 March 12
## 4 April 3
## # … with 3 more rows
d %>%
group_by(month, week) %>%
summarize(`Total km` = sum(km)) %>%
filter(week > 3) #simply to show you why week and month
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
## # A tibble: 24 × 3
## # Groups: month [7]
## month week `Total km`
## <ord> <dbl> <dbl>
## 1 January 4 18.2
## 2 January 5 6.2
## 3 February 5 12
## 4 February 6 18.2
## # … with 20 more rows
The output is still grouped.
If you don’t need groups, ungroup()
the table, or you
may get unexpected results later on.
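A minimal sketch of both options, reusing the pipeline from above:
d %>%
  group_by(month, week) %>%
  summarize(`Total km` = sum(km), .groups = "drop") # drop the grouping right away
# equivalently: ... %>% summarize(`Total km` = sum(km)) %>% ungroup()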
dplyr also provides verbs to join tables. The output table keeps:
full_join(left_table, right_table): all levels.
inner_join(left_table, right_table): only levels present in both.
left_join(left_table, right_table): only levels in left_table.
right_join(left_table, right_table): only levels in right_table.
## Joining, by = "Group"
## # A tibble: 4 × 3
## Group x y
## <chr> <dbl> <dbl>
## 1 A 0.0453 NA
## 2 B 0.586 1.05
## 3 C 0.770 -0.852
## 4 D NA 1.11
## Joining, by = "Group"
## # A tibble: 2 × 3
## Group x y
## <chr> <dbl> <dbl>
## 1 B 0.586 1.05
## 2 C 0.770 -0.852
## Joining, by = "Group"
## # A tibble: 3 × 3
## Group x y
## <chr> <dbl> <dbl>
## 1 A 0.0453 NA
## 2 B 0.586 1.05
## 3 C 0.770 -0.852
## Joining, by = "Group"
## # A tibble: 3 × 3
## Group x y
## <chr> <dbl> <dbl>
## 1 B 0.586 1.05
## 2 C 0.770 -0.852
## 3 D NA 1.11
Joining requires (at least) one joining variable. Avoid ambiguity: specify it.
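For example, an explicit left join (hypothetical call, consistent with the output below):
left_join(left_table, right_table, by = "Group") # keep all levels of left_table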
## # A tibble: 3 × 3
## Group x y
## <chr> <dbl> <dbl>
## 1 A 0.0453 NA
## 2 B 0.586 1.05
## 3 C 0.770 -0.852
You can join by multiple variables.
left_table <- tibble(`First level` = sample(LETTERS, 100, replace = TRUE),
`Second level` = sample(letters, 100, replace = TRUE),
x = rnorm(100))
right_table <- tibble(`First level` = sample(LETTERS, 100, replace = TRUE),
`Second level` = sample(letters, 100, replace = TRUE),
y = rnorm(100))
inner_join(left_table, right_table, by = c("First level", "Second level"))
## # A tibble: 16 × 4
## `First level` `Second level` x y
## <chr> <chr> <dbl> <dbl>
## 1 U s 1.08 0.697
## 2 K m -0.106 -0.182
## 3 J y -1.26 -0.588
## 4 F x 0.411 1.03
## # … with 12 more rows
Variables do not need to have the same name.
left_table <- tibble(`First level` = sample(LETTERS, 100, replace = TRUE),
`Second level` = sample(letters, 100, replace = TRUE),
x = rnorm(100))
right_table <- tibble(`First group` = sample(LETTERS, 100, replace = TRUE),
`Second group` = sample(letters, 100, replace = TRUE),
y = rnorm(100))
inner_join(left_table, right_table,
by = c("First level" = "First group",
"Second level" = "Second group"))
## # A tibble: 26 × 4
## `First level` `Second level` x y
## <chr> <chr> <dbl> <dbl>
## 1 L h -0.764 0.856
## 2 D j -0.447 -0.149
## 3 F f 1.58 -0.118
## 4 T u -0.684 1.25
## # … with 22 more rows
I also have the names of the routes:
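A sketch of how such a routes table could be built (only the four rows visible below are shown; the real table has 6 rows):
routes <- tibble(
  distance     = c(4.6, 6, 6.4, 8.7),
  `route name` = c("stadium", "work", "first bridge", "second bridge")
)
routes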
## # A tibble: 6 × 2
## distance `route name`
## <dbl> <chr>
## 1 4.6 stadium
## 2 6 work
## 3 6.4 first bridge
## 4 8.7 second bridge
## # … with 2 more rows
d %>%
left_join(routes, by = c("km" = "distance")) %>%
group_by(`route name`) %>%
summarise(`Total km` = sum(km)) %>%
arrange(desc(`Total km`)) %>%
mutate(`route name` = ifelse(is.na(`route name`), "no name", `route name`))
## # A tibble: 6 × 2
## `route name` `Total km`
## <chr> <dbl>
## 1 work 144
## 2 no name 101.
## 3 second bridge 52.2
## 4 train lake 20.7
## # … with 2 more rows
## New names:
## • `Avg Run Cadence` -> `Avg Run Cadence...10`
## • `Max Run Cadence` -> `Max Run Cadence...11`
## • `Avg Run Cadence` -> `Avg Run Cadence...20`
## • `Max Run Cadence` -> `Max Run Cadence...21`
## # A tibble: 74 × 41
## `Activity Type` Date Favorite Title Distance Calories Time
## <chr> <dttm> <lgl> <chr> <dbl> <dbl> <time>
## 1 Running 2022-09-08 08:19:19 FALSE Stara Z… 5.35 484 36'08"
## 2 Running 2022-09-04 10:48:08 FALSE Leipzig… 8.41 698 47'50"
## 3 Running 2022-09-02 07:04:01 FALSE Leipzig… 6.76 524 39'15"
## 4 Running 2022-09-01 16:42:50 FALSE Leipzig… 5.7 457 31'20"
## # … with 70 more rows, and 34 more variables: `Avg HR` <dbl>, `Max HR` <dbl>,
## # `Avg Run Cadence...10` <dbl>, `Max Run Cadence...11` <dbl>,
## # `Avg Pace` <chr>, `Best Pace` <chr>, `Total Ascent` <chr>,
## # `Total Descent` <chr>, `Avg Stride Length` <dbl>,
## # `Avg Vertical Ratio` <dbl>, `Avg Vertical Oscillation` <dbl>,
## # `Avg Ground Contact Time` <dbl>, `Avg Run Cadence...20` <dbl>,
## # `Max Run Cadence...21` <dbl>, `Training Stress Score®` <dbl>, …
We are interested only in rows with Activity Type equal to "Running" and in the columns Date, Distance, Time, and Avg HR; relationships between them can then be checked quickly with pairs().
garmin <- read_csv("garmin.csv", show_col_types = FALSE) %>%
filter(`Activity Type` == "Running") %>%
select(Date, Distance, Time, `Avg HR`)
## New names:
## • `Avg Run Cadence` -> `Avg Run Cadence...10`
## • `Max Run Cadence` -> `Max Run Cadence...11`
## • `Avg Run Cadence` -> `Avg Run Cadence...20`
## • `Max Run Cadence` -> `Max Run Cadence...21`
For each dataset: round() %>% summarize() %>% join().
However, one round() %>% summarize() pipeline can be nested directly into the join() call:
avg_speed <- kalenji %>%
mutate(Distance = round(Distance)) %>%
group_by(Distance) %>%
summarize(`Average Speed` = mean(Speed)) %>%
full_join(
garmin %>%
mutate(Distance = round(Distance)) %>%
group_by(Distance) %>%
summarize(`Average Speed` = mean(Speed)),
by = "Distance",
suffix = c(" - kalenji", " - garmin")
)
Line indentations (should) matter
## # A tibble: 10 × 3
## Distance `Average Speed - kalenji` `Average Speed - garmin`
## <dbl> <dbl> <dbl>
## 1 5 10.8 10.2
## 2 6 11.0 10.9
## 3 8 11.2 10.5
## 4 9 10.8 9.68
## # … with 6 more rows
avg_speed <- kalenji %>%
mutate(Distance = round(Distance)) %>%
group_by(Distance) %>%
summarize(`Average Speed` = mean(Speed)) %>%
full_join(
garmin %>%
mutate(Distance = round(Distance)) %>%
group_by(Distance) %>%
summarize(`Average Speed` = mean(Speed)),
by = "Distance",
suffix = c(" - kalenji", " - garmin")
)
avg_speed <- kalenji %>%
mutate(Distance = round(Distance)) %>% group_by(Distance) %>%
summarize(
`Average Speed` = mean(Speed)) %>%
full_join(garmin %>%
mutate(Distance = round(Distance)) %>% group_by(Distance) %>%
summarize(`Average Speed` = mean(Speed)), by = "Distance",
suffix = c(
" - kalenji",
" - garmin"
))
Midway upon
the
journey of our life, I
found myself withinaforestdark
for
the
straightforward
pathway had been lost.
Midway upon the journey of our life,
I found myself within a forest dark,
for the straightforward pathway had been lost.
One line = one operation
One block = one coherent procedure
lines -> blocks -> pipeline
All manipulations were done on the fly, i.e. without saving
objects to memory unless specifically stated:
avg_speed is the only new object created; the old tables are still the same:
## [1] 5.35 8.41 6.76 5.70 5.70 2.00 3.82 5.69 10.54 5.84 5.72 5.71
## [13] 5.67 9.52 6.01 5.81 5.74 6.03 5.78 7.04 6.06 5.83 5.84 5.79
## [25] 5.81 8.08 6.15 5.38 5.83 5.77 4.38 1.40 9.29 5.71 5.72 5.80
## [37] 6.07 7.02 6.53 6.61 8.42 10.49 6.04 5.96 6.97 8.42 6.01 5.73
## [49] 5.78 5.58 5.04 6.01 5.71 4.50 10.01 5.75 5.84 5.91 5.81 10.10
## [61] 6.00 6.07 6.21 6.05 5.66 5.71 7.89
Let’s say we want to compare Average Speed between the two
methods, e.g. boxplot()
.
How would you manipulate this table?
## # A tibble: 10 × 3
## Distance `Average Speed - kalenji` `Average Speed - garmin`
## <dbl> <dbl> <dbl>
## 1 5 10.8 10.2
## 2 6 11.0 10.9
## 3 8 11.2 10.5
## 4 9 10.8 9.68
## # … with 6 more rows
Tidy data: one row = one observation.
Goal: To achieve tidy data.
Pros:
Cons:
The most frequent use of tidyr is for pivoting: from long to wide and back again.
Long table
## # A tibble: 3 × 2
## Group x
## <chr> <dbl>
## 1 A -0.418
## 2 B -0.754
## 3 C -0.222
pivot_longer(): wide to long.
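A hypothetical call, assuming a wide one-row table wide_table with columns A, B, and C:
wide_table %>%
  pivot_longer(cols = everything(), names_to = "Group", values_to = "x")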
## # A tibble: 3 × 2
## Group x
## <chr> <dbl>
## 1 A -0.613
## 2 B -1.78
## 3 C 1.51
pivot_wider(): long to wide.
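A hypothetical call, assuming the long table printed above is called long_table:
long_table %>%
  pivot_wider(names_from = Group, values_from = x)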
## # A tibble: 1 × 3
## A B C
## <dbl> <dbl> <dbl>
## 1 -0.418 -0.754 -0.222
## # A tibble: 10 × 3
## Distance `Average Speed - kalenji` `Average Speed - garmin`
## <dbl> <dbl> <dbl>
## 1 5 10.8 10.2
## 2 6 11.0 10.9
## 3 8 11.2 10.5
## 4 9 10.8 9.68
## # … with 6 more rows
Exercise: make the avg_speed table longer, with three columns: Distance, Speed, and Method.
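One possible solution (a sketch; names_prefix strips the common part of the old column names):
avg_speed %>%
  pivot_longer(
    cols         = starts_with("Average Speed"),
    names_to     = "Method",
    names_prefix = "Average Speed - ",
    values_to    = "Speed"
  )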
d <- read_csv("kalenji.csv", show_col_types = FALSE) %>%
mutate(date = as_date(date, format = "%m/%d/%Y"),
week = week(date),
speed = km / ( hour(time) + minute(time) / 60 )) %>%
group_by(week) %>%
summarize(`Total km` = sum(km),
`Avg speed` = mean(speed)) %>%
mutate(method = "Kalenji") %>%
bind_rows(
read_csv("garmin.csv", show_col_types = FALSE) %>%
filter(`Activity Type` == "Running") %>%
mutate(Date = as_date(Date),
week = week(Date),
speed = Distance / ( hour(Time) + minute(Time) / 60 )) %>%
group_by(week) %>%
summarize(`Total km` = sum(Distance),
`Avg speed` = mean(speed)) %>%
mutate(method = "Garmin")
) %>%
mutate(week = ifelse(week > 40, week - 53, week)) #these are last year weeks
## New names:
## • `Avg Run Cadence` -> `Avg Run Cadence...10`
## • `Max Run Cadence` -> `Max Run Cadence...11`
## • `Avg Run Cadence` -> `Avg Run Cadence...20`
## • `Max Run Cadence` -> `Max Run Cadence...21`
par(mfrow = c(1, 2))
with(d, plot(week, `Total km`, col = as.factor(method)))
with(d, plot(week, `Avg speed`, col = as.factor(method)))
I probably wasn’t so accurate with the Kalenji watch.