# Inspeção de dados ----
# Conjunto de dados ----
library(metan)
library(rio)
#' Render a table through knitr::kable()
#'
#' Helper for the tables of this document (the original note describes it
#' as generating the HTML tables).
#'
#' @param table Object handed to `knitr::kable()` as the table to render.
#' @param digits Number of digits forwarded to `knitr::kable()`; default 3.
#' @param ... Further arguments forwarded unchanged to `knitr::kable()`.
#' @return The value returned by `knitr::kable()`.
print_tbl <- function(table, digits = 3, ...) {
  knitr::kable(table, digits = digits, booktabs = TRUE, ...)
}
# "Tidy" data: read the example data set from its URL
# (setclass = "tbl" asks rio for a tibble)
df_tidy <- rio::import(file = "http://bit.ly/df_tidy", setclass = "tbl")
# Inspecionar dados ----
# First look at the raw data: per-column class, missing values, levels,
# valid n, min/median/max and number of possible outliers
metan::inspect(df_tidy)
## # A tibble: 13 x 9
## Variable Class Missing Levels Valid_n Min Median Max Outlier
## <chr> <chr> <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 ENV character No 0 114 NA NA NA NA
## 2 GEN character No 0 114 NA NA NA NA
## 3 BLOCO character No 0 114 NA NA NA NA
## 4 ALT_PLANT numeric No - 114 0 2.52 3.04 1
## 5 ALT_ESP numeric No - 114 0.75 1.34 1.88 0
## 6 COMPES numeric No - 114 12.2 15.1 17.9 0
## 7 DIAMES numeric No - 114 43.5 49.8 54.9 0
## 8 COMP_SAB numeric No - 114 23.5 28.2 34.7 0
## 9 DIAM_SAB numeric No - 114 13.3 16.0 18.3 0
## 10 MGE numeric No - 114 106. 172. 251. 0
## 11 NFIL numeric No - 114 12.4 16 21.2 1
## 12 MMG character No 0 114 NA NA NA NA
## 13 NGE numeric Yes - 112 354 504. 5302 5
## Warning: Considering the levels of factors, .data should have 1 rows, but it has
## 114. Use 'as_factor()' for coercing a variable to a factor.
## Warning: Expected three or more factor variables. The data has only 0.
## Warning: Missing values in variable(s) NGE.
## Warning: Possible outliers in variable(s) ALT_PLANT, NFIL, NGE. Use
## 'find_outliers()' for more details.
## Warning: Zero values observed in variable(s) ALT_PLANT.
# Coerce the first three columns (ENV, GEN, BLOCO) to factors
df_ok <- metan::as_factor(df_tidy, 1:3)
# Fragmentos de texto inesperados ----
# Locate the rows of MMG that hold text fragments instead of numbers
metan::find_text_in_num(df_ok, MMG)
## [1] 112
# Show the MMG entry in row 112, the row reported by find_text_in_num()
df_ok[112, "MMG"]
## # A tibble: 1 x 1
## MMG
## <chr>
## 1 335..2
# Replace '..' with '.' in MMG, then convert the column to numeric
df_ok <- replace_string(df_ok, MMG, pattern = "\\.{2}", replacement = ".")
df_ok <- as_numeric(df_ok, MMG)
# Substituir zeros por NA ----
# Turn the zero values recorded in ALT_PLANT into NA
df_ok <- replace_zero(df_ok, ALT_PLANT)
# Inspect again after the corrections, this time also requesting the plot
metan::inspect(df_ok, plot = TRUE)
## # A tibble: 13 x 9
## Variable Class Missing Levels Valid_n Min Median Max Outlier
## <chr> <chr> <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 ENV factor No 3 114 NA NA NA NA
## 2 GEN factor No 13 114 NA NA NA NA
## 3 BLOCO factor No 3 114 NA NA NA NA
## 4 ALT_PLANT numeric Yes - 113 1.71 2.52 3.04 0
## 5 ALT_ESP numeric No - 114 0.75 1.34 1.88 0
## 6 COMPES numeric No - 114 12.2 15.1 17.9 0
## 7 DIAMES numeric No - 114 43.5 49.8 54.9 0
## 8 COMP_SAB numeric No - 114 23.5 28.2 34.7 0
## 9 DIAM_SAB numeric No - 114 13.3 16.0 18.3 0
## 10 MGE numeric No - 114 106. 172. 251. 0
## 11 NFIL numeric No - 114 12.4 16 21.2 1
## 12 MMG numeric No - 114 227. 336. 452. 0
## 13 NGE numeric Yes - 112 354 504. 5302 5
## Warning: Considering the levels of factors, .data should have 117 rows, but it
## has 114. Use 'as_factor()' for coercing a variable to a factor.
## Warning: Missing values in variable(s) ALT_PLANT, NGE.
## Warning: Possible outliers in variable(s) NFIL, NGE. Use 'find_outliers()' for
## more details.
# Outliers ----
# Possible outliers in NFIL (with diagnostic plots)
metan::find_outliers(df_ok, NFIL, plots = TRUE)
## Trait: NFIL
## Number of possible outliers: 1
## Line(s): 14
## Proportion: 0.9%
## Mean of the outliers: 21.2
## Maximum of the outliers: 21.2 | Line 14
## Minimum of the outliers: 21.2 | Line 14
## With outliers: mean = 16.154 | CV = 10.833%
## Without outliers: mean = 16.11 | CV = 10.499%
# Possible outliers in NGE (with diagnostic plots)
metan::find_outliers(df_ok, NGE, plots = TRUE)
## Trait: NGE
## Number of possible outliers: 5
## Line(s): 11 14 22 66 89
## Proportion: 4.7%
## Mean of the outliers: 1606.8
## Maximum of the outliers: 5302 | Line 22
## Minimum of the outliers: 667.8 | Line 89
## With outliers: mean = 550.623 | CV = 83.324%
## Without outliers: mean = 501.269 | CV = 13.121%
# Correct the value flagged as the maximum NGE outlier (row 22 reads 5302).
# NOTE(review): presumably a misplaced decimal point; confirm 530.2 against
# the raw records.
# The column is addressed by name rather than by position (13) so the fix
# keeps hitting NGE even if columns are added, dropped or reordered upstream.
df_ok[22, "NGE"] <- 530.2
# Export df_ok to an Excel file (call left commented out)
# export(df_ok, "df_ok.xlsx")