Data manipulation in R with metan
Utilities for rows and columns
Add columns and rows
The functions add_cols()
and add_rows()
can be used to add columns and rows, respectively to a data frame.
library(metan)
add_cols(data_ge,
ROW_ID = 1:420)
# # A tibble: 420 x 6
# ENV GEN REP GY HM ROW_ID
# <fct> <fct> <fct> <dbl> <dbl> <int>
# 1 E1 G1 1 2.17 44.9 1
# 2 E1 G1 2 2.50 46.9 2
# 3 E1 G1 3 2.43 47.8 3
# 4 E1 G2 1 3.21 45.2 4
# 5 E1 G2 2 2.93 45.3 5
# 6 E1 G2 3 2.56 45.5 6
# 7 E1 G3 1 2.77 46.7 7
# 8 E1 G3 2 3.62 43.2 8
# 9 E1 G3 3 2.28 47.8 9
# 10 E1 G4 1 2.36 47.9 10
# # ... with 410 more rows
It is also possible to add a column based on existing data. Note that the arguments .after
and .before
are used to select the position of the new column(s). This is particularly useful to put variables of the same category together.
add_cols(data_ge,
GY2 = GY^2,
.after = "GY")
# # A tibble: 420 x 6
# ENV GEN REP GY GY2 HM
# <fct> <fct> <fct> <dbl> <dbl> <dbl>
# 1 E1 G1 1 2.17 4.70 44.9
# 2 E1 G1 2 2.50 6.27 46.9
# 3 E1 G1 3 2.43 5.89 47.8
# 4 E1 G2 1 3.21 10.3 45.2
# 5 E1 G2 2 2.93 8.60 45.3
# 6 E1 G2 3 2.56 6.58 45.5
# 7 E1 G3 1 2.77 7.67 46.7
# 8 E1 G3 2 3.62 13.1 43.2
# 9 E1 G3 3 2.28 5.18 47.8
# 10 E1 G4 1 2.36 5.57 47.9
# # ... with 410 more rows
Select or remove columns and rows
The functions select_cols()
and select_rows()
can be used to select columns and rows, respectively from a data frame.
select_cols(data_ge2, ENV, GEN)
# # A tibble: 156 x 2
# ENV GEN
# <fct> <fct>
# 1 A1 H1
# 2 A1 H1
# 3 A1 H1
# 4 A1 H10
# 5 A1 H10
# 6 A1 H10
# 7 A1 H11
# 8 A1 H11
# 9 A1 H11
# 10 A1 H12
# # ... with 146 more rows
select_rows(data_ge2, 1:3)
# # A tibble: 3 x 18
# ENV GEN REP PH EH EP EL ED CL CD CW KW NR
# <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 A1 H1 1 2.61 1.71 0.658 16.1 52.2 28.1 16.3 25.1 217. 15.6
# 2 A1 H1 2 2.87 1.76 0.628 14.2 50.3 27.6 14.5 21.4 184. 16
# 3 A1 H1 3 2.68 1.58 0.591 16.0 50.7 28.4 16.4 24.0 208. 17.2
# # ... with 5 more variables: NKR <dbl>, CDED <dbl>, PERK <dbl>, TKW <dbl>,
# # NKE <dbl>
Numeric columns can be selected quickly by using the function select_numeric_cols()
. Non-numeric columns are selected with select_non_numeric_cols()
select_numeric_cols(data_ge2)
# # A tibble: 156 x 15
# PH EH EP EL ED CL CD CW KW NR NKR CDED PERK
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2.61 1.71 0.658 16.1 52.2 28.1 16.3 25.1 217. 15.6 36.6 0.538 89.6
# 2 2.87 1.76 0.628 14.2 50.3 27.6 14.5 21.4 184. 16 31.4 0.551 89.5
# 3 2.68 1.58 0.591 16.0 50.7 28.4 16.4 24.0 208. 17.2 31.8 0.561 89.7
# 4 2.83 1.64 0.581 16.7 54.1 31.7 17.4 26.2 194. 15.6 32.8 0.586 87.9
# 5 2.79 1.71 0.616 14.9 52.7 32.0 15.5 20.7 176. 17.6 28 0.607 89.7
# 6 2.72 1.51 0.554 16.7 52.7 30.4 17.5 26.8 207. 16.8 32.8 0.577 88.5
# 7 2.75 1.51 0.549 17.4 51.7 30.6 18.0 26.2 217. 16.8 34.6 0.594 89.1
# 8 2.72 1.56 0.573 16.7 47.2 28.7 17.2 24.1 181. 13.6 34.4 0.608 88.3
# 9 2.77 1.67 0.600 15.8 47.9 27.6 16.4 20.5 166. 15.2 34.8 0.576 89.0
# 10 2.73 1.54 0.563 14.9 47.5 28.2 15.5 20.1 161. 14.8 31.6 0.597 88.7
# # ... with 146 more rows, and 2 more variables: TKW <dbl>, NKE <dbl>
select_non_numeric_cols(data_ge2)
# # A tibble: 156 x 3
# ENV GEN REP
# <fct> <fct> <fct>
# 1 A1 H1 1
# 2 A1 H1 2
# 3 A1 H1 3
# 4 A1 H10 1
# 5 A1 H10 2
# 6 A1 H10 3
# 7 A1 H11 1
# 8 A1 H11 2
# 9 A1 H11 3
# 10 A1 H12 1
# # ... with 146 more rows
We can select the first or last columns quickly with select_first_col()
and select_last_col()
, respectively.
select_first_col(data_ge2)
# # A tibble: 156 x 1
# ENV
# <fct>
# 1 A1
# 2 A1
# 3 A1
# 4 A1
# 5 A1
# 6 A1
# 7 A1
# 8 A1
# 9 A1
# 10 A1
# # ... with 146 more rows
select_last_col(data_ge2)
# # A tibble: 156 x 1
# NKE
# <dbl>
# 1 521.
# 2 494.
# 3 565.
# 4 519.
# 5 502.
# 6 525.
# 7 575
# 8 501.
# 9 513.
# 10 480.
# # ... with 146 more rows
To remove columns or rows, use remove_cols()
and remove_rows()
.
remove_cols(data_ge2, ENV, GEN)
# # A tibble: 156 x 16
# REP PH EH EP EL ED CL CD CW KW NR NKR CDED
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 2.61 1.71 0.658 16.1 52.2 28.1 16.3 25.1 217. 15.6 36.6 0.538
# 2 2 2.87 1.76 0.628 14.2 50.3 27.6 14.5 21.4 184. 16 31.4 0.551
# 3 3 2.68 1.58 0.591 16.0 50.7 28.4 16.4 24.0 208. 17.2 31.8 0.561
# 4 1 2.83 1.64 0.581 16.7 54.1 31.7 17.4 26.2 194. 15.6 32.8 0.586
# 5 2 2.79 1.71 0.616 14.9 52.7 32.0 15.5 20.7 176. 17.6 28 0.607
# 6 3 2.72 1.51 0.554 16.7 52.7 30.4 17.5 26.8 207. 16.8 32.8 0.577
# 7 1 2.75 1.51 0.549 17.4 51.7 30.6 18.0 26.2 217. 16.8 34.6 0.594
# 8 2 2.72 1.56 0.573 16.7 47.2 28.7 17.2 24.1 181. 13.6 34.4 0.608
# 9 3 2.77 1.67 0.600 15.8 47.9 27.6 16.4 20.5 166. 15.2 34.8 0.576
# 10 1 2.73 1.54 0.563 14.9 47.5 28.2 15.5 20.1 161. 14.8 31.6 0.597
# # ... with 146 more rows, and 3 more variables: PERK <dbl>, TKW <dbl>,
# # NKE <dbl>
remove_rows(data_ge2, 1:2, 5:8)
# # A tibble: 150 x 18
# ENV GEN REP PH EH EP EL ED CL CD CW KW NR
# <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 A1 H1 3 2.68 1.58 0.591 16.0 50.7 28.4 16.4 24.0 208. 17.2
# 2 A1 H10 1 2.83 1.64 0.581 16.7 54.1 31.7 17.4 26.2 194. 15.6
# 3 A1 H11 3 2.77 1.67 0.600 15.8 47.9 27.6 16.4 20.5 166. 15.2
# 4 A1 H12 1 2.73 1.54 0.563 14.9 47.5 28.2 15.5 20.1 161. 14.8
# 5 A1 H12 2 2.56 1.56 0.616 15.7 49.9 29.9 16.2 24.0 188. 17.2
# 6 A1 H12 3 2.79 1.53 0.546 15.0 52.7 31.4 15.2 32.9 193. 20
# 7 A1 H13 1 2.74 1.60 0.586 14.6 54.0 32.5 15.1 31.5 205. 20
# 8 A1 H13 2 2.64 1.37 0.517 14.8 53.7 31.0 15.5 31.1 239. 20.4
# 9 A1 H13 3 2.93 1.77 0.602 14.9 52.7 30.1 15.8 31.3 212. 15.6
# 10 A1 H2 1 2.55 1.22 0.481 15.1 51.7 27.7 15.3 23.7 198. 16.4
# # ... with 140 more rows, and 5 more variables: NKR <dbl>, CDED <dbl>,
# # PERK <dbl>, TKW <dbl>, NKE <dbl>
Concatenating columns
The function concatetate()
can be used to concatenate multiple columns of a data frame. It return a data frame with all the original columns in .data
plus the concatenated variable, after the last column (by default), or at any position when using the arguments .before
or .after
.
concatenate(data_ge, ENV, GEN, REP, .after = "REP")
# # A tibble: 420 x 6
# ENV GEN REP new_var GY HM
# <fct> <fct> <fct> <chr> <dbl> <dbl>
# 1 E1 G1 1 E1_G1_1 2.17 44.9
# 2 E1 G1 2 E1_G1_2 2.50 46.9
# 3 E1 G1 3 E1_G1_3 2.43 47.8
# 4 E1 G2 1 E1_G2_1 3.21 45.2
# 5 E1 G2 2 E1_G2_2 2.93 45.3
# 6 E1 G2 3 E1_G2_3 2.56 45.5
# 7 E1 G3 1 E1_G3_1 2.77 46.7
# 8 E1 G3 2 E1_G3_2 3.62 43.2
# 9 E1 G3 3 E1_G3_3 2.28 47.8
# 10 E1 G4 1 E1_G4_1 2.36 47.9
# # ... with 410 more rows
To drop the existing variables and keep only the concatenated column, use the argument drop = TRUE
. To use concatenate()
within a given function like add_cols()
use the argument pull = TRUE
to pull out the results to a vector.
concatenate(data_ge, ENV, GEN, REP, drop = TRUE) %>% head()
# # A tibble: 6 x 1
# new_var
# <chr>
# 1 E1_G1_1
# 2 E1_G1_2
# 3 E1_G1_3
# 4 E1_G2_1
# 5 E1_G2_2
# 6 E1_G2_3
concatenate(data_ge, ENV, GEN, REP, pull = TRUE) %>% head()
# [1] "E1_G1_1" "E1_G1_2" "E1_G1_3" "E1_G2_1" "E1_G2_2" "E1_G2_3"
To check if a column exists in a data frame, use column_exists()
column_exists(data_ge, "ENV")
# [1] TRUE
Getting levels
To get the levels and the size of the levels of a factor, the functions get_levels()
and get_level_size()
can be used.
get_levels(data_ge, ENV)
# [1] "E1" "E10" "E11" "E12" "E13" "E14" "E2" "E3" "E4" "E5" "E6" "E7"
# [13] "E8" "E9"
get_level_size(data_ge, ENV)
# E1 E10 E11 E12 E13 E14 E2 E3 E4 E5 E6 E7 E8 E9
# 30 30 30 30 30 30 30 30 30 30 30 30 30 30
Utilities for numbers and strings
Rounding whole data frames
The function round_cols()
round a selected column or a whole data frame to the specified number of decimal places (default 0). If no variables are informed, then all numeric variables are rounded.
round_cols(data_ge2)
# # A tibble: 156 x 18
# ENV GEN REP PH EH EP EL ED CL CD CW KW NR
# <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 A1 H1 1 2.61 1.71 0.66 16.1 52.2 28.1 16.3 25.1 217. 15.6
# 2 A1 H1 2 2.87 1.76 0.63 14.2 50.3 27.6 14.5 21.4 184. 16
# 3 A1 H1 3 2.68 1.58 0.59 16.0 50.7 28.4 16.4 24.0 208. 17.2
# 4 A1 H10 1 2.83 1.64 0.580 16.7 54.0 31.7 17.4 26.2 194. 15.6
# 5 A1 H10 2 2.79 1.71 0.62 14.9 52.7 32.0 15.5 20.7 176. 17.6
# 6 A1 H10 3 2.72 1.51 0.55 16.7 52.7 30.4 17.5 26.8 207. 16.8
# 7 A1 H11 1 2.75 1.51 0.55 17.4 51.7 30.6 18.0 26.2 217. 16.8
# 8 A1 H11 2 2.72 1.56 0.570 16.7 47.2 28.7 17.2 24.1 181. 13.6
# 9 A1 H11 3 2.77 1.67 0.6 15.8 47.9 27.6 16.4 20.5 166. 15.2
# 10 A1 H12 1 2.73 1.54 0.56 14.9 47.5 28.2 15.5 20.1 161. 14.8
# # ... with 146 more rows, and 5 more variables: NKR <dbl>, CDED <dbl>,
# # PERK <dbl>, TKW <dbl>, NKE <dbl>
Alternatively, select variables to round.
round_cols(data_ge2, PH, EP, digits = 1)
# # A tibble: 156 x 18
# ENV GEN REP PH EH EP EL ED CL CD CW KW NR
# <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 A1 H1 1 2.6 1.71 0.7 16.1 52.2 28.1 16.3 25.1 217. 15.6
# 2 A1 H1 2 2.9 1.76 0.6 14.2 50.3 27.6 14.5 21.4 184. 16
# 3 A1 H1 3 2.7 1.58 0.6 16.0 50.7 28.4 16.4 24.0 208. 17.2
# 4 A1 H10 1 2.8 1.64 0.6 16.7 54.1 31.7 17.4 26.2 194. 15.6
# 5 A1 H10 2 2.8 1.71 0.6 14.9 52.7 32.0 15.5 20.7 176. 17.6
# 6 A1 H10 3 2.7 1.51 0.6 16.7 52.7 30.4 17.5 26.8 207. 16.8
# 7 A1 H11 1 2.8 1.51 0.5 17.4 51.7 30.6 18.0 26.2 217. 16.8
# 8 A1 H11 2 2.7 1.56 0.6 16.7 47.2 28.7 17.2 24.1 181. 13.6
# 9 A1 H11 3 2.8 1.67 0.6 15.8 47.9 27.6 16.4 20.5 166. 15.2
# 10 A1 H12 1 2.7 1.54 0.6 14.9 47.5 28.2 15.5 20.1 161. 14.8
# # ... with 146 more rows, and 5 more variables: NKR <dbl>, CDED <dbl>,
# # PERK <dbl>, TKW <dbl>, NKE <dbl>
Extracting and replacing numbers
The functions extract_number()
, and replace_number()
can be used to extract or replace numbers. As an example, we will extract the number of each genotype in data_g
. By default, the extracted numbers are put as a new variable called new_var
after the last column of the data.
extract_number(data_ge, GEN, .after = "GEN")
# # A tibble: 420 x 6
# ENV GEN new_var REP GY HM
# <fct> <fct> <dbl> <fct> <dbl> <dbl>
# 1 E1 G1 1 1 2.17 44.9
# 2 E1 G1 1 2 2.50 46.9
# 3 E1 G1 1 3 2.43 47.8
# 4 E1 G2 2 1 3.21 45.2
# 5 E1 G2 2 2 2.93 45.3
# 6 E1 G2 2 3 2.56 45.5
# 7 E1 G3 3 1 2.77 46.7
# 8 E1 G3 3 2 3.62 43.2
# 9 E1 G3 3 3 2.28 47.8
# 10 E1 G4 4 1 2.36 47.9
# # ... with 410 more rows
If the argument drop
is set to TRUE
then, only the new variable is kept and all others are dropped.
extract_number(data_ge, GEN, drop = TRUE)
# # A tibble: 420 x 1
# new_var
# <dbl>
# 1 1
# 2 1
# 3 1
# 4 2
# 5 2
# 6 2
# 7 3
# 8 3
# 9 3
# 10 4
# # ... with 410 more rows
To pull out the results into a vector, use the argument pull = TRUE
. This is particularly useful when extract_*
or replace_*
are used within a function like add_cols()
.
extract_number(data_ge, GEN, pull = TRUE)
# [1] 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 7 7 7 8 8 8 9
# [26] 9 9 10 10 10 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 7 7
# [51] 7 8 8 8 9 9 9 10 10 10 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5
# [76] 6 6 6 7 7 7 8 8 8 9 9 9 10 10 10 1 1 1 2 2 2 3 3 3 4
# [101] 4 4 5 5 5 6 6 6 7 7 7 8 8 8 9 9 9 10 10 10 1 1 1 2 2
# [126] 2 3 3 3 4 4 4 5 5 5 6 6 6 7 7 7 8 8 8 9 9 9 10 10 10
# [151] 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 7 7 7 8 8 8 9
# [176] 9 9 10 10 10 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 7 7
# [201] 7 8 8 8 9 9 9 10 10 10 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5
# [226] 6 6 6 7 7 7 8 8 8 9 9 9 10 10 10 1 1 1 2 2 2 3 3 3 4
# [251] 4 4 5 5 5 6 6 6 7 7 7 8 8 8 9 9 9 10 10 10 1 1 1 2 2
# [276] 2 3 3 3 4 4 4 5 5 5 6 6 6 7 7 7 8 8 8 9 9 9 10 10 10
# [301] 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 7 7 7 8 8 8 9
# [326] 9 9 10 10 10 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 7 7
# [351] 7 8 8 8 9 9 9 10 10 10 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5
# [376] 6 6 6 7 7 7 8 8 8 9 9 9 10 10 10 1 1 1 2 2 2 3 3 3 4
# [401] 4 4 5 5 5 6 6 6 7 7 7 8 8 8 9 9 9 10 10 10
To replace numbers of a given column with a specified replacement, use replace_number()
. By default, numbers are replaced with “”. The argument drop
and pull
can also be used, as shown above.
replace_number(data_ge, GEN)
# # A tibble: 420 x 6
# ENV GEN REP GY HM new_var
# <fct> <fct> <fct> <dbl> <dbl> <chr>
# 1 E1 G1 1 2.17 44.9 G
# 2 E1 G1 2 2.50 46.9 G
# 3 E1 G1 3 2.43 47.8 G
# 4 E1 G2 1 3.21 45.2 G
# 5 E1 G2 2 2.93 45.3 G
# 6 E1 G2 3 2.56 45.5 G
# 7 E1 G3 1 2.77 46.7 G
# 8 E1 G3 2 3.62 43.2 G
# 9 E1 G3 3 2.28 47.8 G
# 10 E1 G4 1 2.36 47.9 G
# # ... with 410 more rows
replace_number(data_ge,
var = REP,
pattern = "1",
replacement = "Rep_1",
new_var = R_ONE,
.after = "REP")
# # A tibble: 420 x 6
# ENV GEN REP R_ONE GY HM
# <fct> <fct> <fct> <chr> <dbl> <dbl>
# 1 E1 G1 1 Rep_1 2.17 44.9
# 2 E1 G1 2 2 2.50 46.9
# 3 E1 G1 3 3 2.43 47.8
# 4 E1 G2 1 Rep_1 3.21 45.2
# 5 E1 G2 2 2 2.93 45.3
# 6 E1 G2 3 3 2.56 45.5
# 7 E1 G3 1 Rep_1 2.77 46.7
# 8 E1 G3 2 2 3.62 43.2
# 9 E1 G3 3 3 2.28 47.8
# 10 E1 G4 1 Rep_1 2.36 47.9
# # ... with 410 more rows
Extracting, replacing, and removing strings
The functions extract_string()
, and replace_string()
are used in the same context of extract_number()
, and replace_number()
, but for handling with strings.
extract_string(data_ge, GEN)
# # A tibble: 420 x 6
# ENV GEN REP GY HM new_var
# <fct> <fct> <fct> <dbl> <dbl> <chr>
# 1 E1 G1 1 2.17 44.9 G
# 2 E1 G1 2 2.50 46.9 G
# 3 E1 G1 3 2.43 47.8 G
# 4 E1 G2 1 3.21 45.2 G
# 5 E1 G2 2 2.93 45.3 G
# 6 E1 G2 3 2.56 45.5 G
# 7 E1 G3 1 2.77 46.7 G
# 8 E1 G3 2 3.62 43.2 G
# 9 E1 G3 3 2.28 47.8 G
# 10 E1 G4 1 2.36 47.9 G
# # ... with 410 more rows
To replace strings, we can use the function replace_strings()
.
replace_string(data_ge,
var = GEN,
new_var = GENOTYPE,
replacement = "GENOTYPE_",
.after = "GEN")
# # A tibble: 420 x 6
# ENV GEN GENOTYPE REP GY HM
# <fct> <fct> <chr> <fct> <dbl> <dbl>
# 1 E1 G1 GENOTYPE_1 1 2.17 44.9
# 2 E1 G1 GENOTYPE_1 2 2.50 46.9
# 3 E1 G1 GENOTYPE_1 3 2.43 47.8
# 4 E1 G2 GENOTYPE_2 1 3.21 45.2
# 5 E1 G2 GENOTYPE_2 2 2.93 45.3
# 6 E1 G2 GENOTYPE_2 3 2.56 45.5
# 7 E1 G3 GENOTYPE_3 1 2.77 46.7
# 8 E1 G3 GENOTYPE_3 2 3.62 43.2
# 9 E1 G3 GENOTYPE_3 3 2.28 47.8
# 10 E1 G4 GENOTYPE_4 1 2.36 47.9
# # ... with 410 more rows
To remove all strings of a data frame, use remove_strings()
.
remove_strings(data_ge)
# # A tibble: 420 x 5
# ENV GEN REP GY HM
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 1 1 2.17 44.9
# 2 1 1 2 2.50 46.9
# 3 1 1 3 2.43 47.8
# 4 1 2 1 3.21 45.2
# 5 1 2 2 2.93 45.3
# 6 1 2 3 2.56 45.5
# 7 1 3 1 2.77 46.7
# 8 1 3 2 3.62 43.2
# 9 1 3 3 2.28 47.8
# 10 1 4 1 2.36 47.9
# # ... with 410 more rows