check outliers over the dataset
check_outliers.Rd
check outliers over the dataset
Usage
check_outliers(
dataset,
uuid_column = "uuid",
element_name = "checked_dataset",
kobo_survey = NULL,
kobo_choices = NULL,
cols_to_add_cleaning_log = NULL,
strongness_factor = 3,
minimum_unique_value_of_variable = NULL,
remove_choice_multiple = TRUE,
sm_separator = ".",
columns_not_to_check = NULL
)
Arguments
- dataset
Dataset to be checked, either as a data frame or as a list with the data frame stored as "checked_dataset".
- uuid_column
Name of the column containing the UUIDs. Default is "uuid".
- element_name
Name of the dataset element in the list. Default is "checked_dataset".
- kobo_survey
Kobo survey sheet. Default is NULL.
- kobo_choices
Kobo choices sheet. Default is NULL.
- cols_to_add_cleaning_log
Variables that must be included in the output.
- strongness_factor
Strongness factor defines how strong your outliers will be. The default is 3.
- minimum_unique_value_of_variable
Default is NULL, meaning this parameter won't be considered. For example, 10 means that any variable with fewer than 10 unique values won't be considered for outlier checking.
- remove_choice_multiple
TRUE (default) will remove choice multiple questions from the output.
- sm_separator
Separator for choice multiple questions. The default is "."
- columns_not_to_check
Columns to exclude from the checks even if they are numeric values.
Value
Returns a list with the checked dataset stored as checked_dataset and a data frame with the outliers log.
Examples
dataset_outlier <- data.frame(
uuid = paste0("uuid_", 1:100),
one_value = c(round(runif(90, min = 45, max = 55)),
round(runif(5)), round(runif(5, 99, 100))),
expense = c(sample(200:500,
replace = TRUE, size = 95),
c(600, 100, 80, 1020, 1050)),
income = c(c(60, 0, 80, 1020, 1050),
sample(20000:50000,
replace = TRUE, size = 95)),
yy = c(rep(100, 99), 10)
)
check_outliers(dataset = dataset_outlier,
uuid_column = "uuid") |>
knitr::kable()
#> [1] "checking_one_value"
#> [1] "checking_expense"
#> [1] "checking_income"
#> [1] "checking_yy"
#>
#>
#> |uuid | one_value| expense| income| yy|
#> |:--------|---------:|-------:|------:|---:|
#> |uuid_1 | 52| 493| 60| 100|
#> |uuid_2 | 54| 481| 0| 100|
#> |uuid_3 | 52| 301| 80| 100|
#> |uuid_4 | 46| 345| 1020| 100|
#> |uuid_5 | 45| 334| 1050| 100|
#> |uuid_6 | 54| 291| 26745| 100|
#> |uuid_7 | 48| 356| 33039| 100|
#> |uuid_8 | 49| 240| 46348| 100|
#> |uuid_9 | 52| 402| 41300| 100|
#> |uuid_10 | 49| 452| 26168| 100|
#> |uuid_11 | 55| 358| 48669| 100|
#> |uuid_12 | 54| 474| 39139| 100|
#> |uuid_13 | 54| 378| 24139| 100|
#> |uuid_14 | 50| 212| 29721| 100|
#> |uuid_15 | 51| 298| 33614| 100|
#> |uuid_16 | 47| 374| 28146| 100|
#> |uuid_17 | 51| 294| 49733| 100|
#> |uuid_18 | 47| 484| 49323| 100|
#> |uuid_19 | 47| 252| 38004| 100|
#> |uuid_20 | 48| 447| 24262| 100|
#> |uuid_21 | 46| 389| 42262| 100|
#> |uuid_22 | 54| 258| 35281| 100|
#> |uuid_23 | 47| 370| 25840| 100|
#> |uuid_24 | 47| 356| 43295| 100|
#> |uuid_25 | 51| 347| 40482| 100|
#> |uuid_26 | 48| 260| 26899| 100|
#> |uuid_27 | 48| 303| 25093| 100|
#> |uuid_28 | 51| 275| 25310| 100|
#> |uuid_29 | 50| 352| 33109| 100|
#> |uuid_30 | 49| 464| 27606| 100|
#> |uuid_31 | 49| 440| 48220| 100|
#> |uuid_32 | 51| 500| 39080| 100|
#> |uuid_33 | 46| 455| 46805| 100|
#> |uuid_34 | 48| 316| 33885| 100|
#> |uuid_35 | 54| 319| 35508| 100|
#> |uuid_36 | 51| 487| 21745| 100|
#> |uuid_37 | 50| 312| 36469| 100|
#> |uuid_38 | 54| 301| 28567| 100|
#> |uuid_39 | 51| 268| 27697| 100|
#> |uuid_40 | 49| 493| 30667| 100|
#> |uuid_41 | 47| 244| 26243| 100|
#> |uuid_42 | 49| 418| 20572| 100|
#> |uuid_43 | 50| 221| 27488| 100|
#> |uuid_44 | 54| 393| 32131| 100|
#> |uuid_45 | 51| 376| 21886| 100|
#> |uuid_46 | 45| 423| 21910| 100|
#> |uuid_47 | 55| 384| 37576| 100|
#> |uuid_48 | 51| 274| 48702| 100|
#> |uuid_49 | 54| 236| 24046| 100|
#> |uuid_50 | 52| 493| 25600| 100|
#> |uuid_51 | 50| 412| 46541| 100|
#> |uuid_52 | 52| 497| 34092| 100|
#> |uuid_53 | 49| 487| 31383| 100|
#> |uuid_54 | 54| 368| 20056| 100|
#> |uuid_55 | 50| 416| 25855| 100|
#> |uuid_56 | 46| 282| 42760| 100|
#> |uuid_57 | 53| 250| 25162| 100|
#> |uuid_58 | 49| 203| 42320| 100|
#> |uuid_59 | 48| 299| 34478| 100|
#> |uuid_60 | 47| 236| 30256| 100|
#> |uuid_61 | 48| 236| 34236| 100|
#> |uuid_62 | 49| 395| 21488| 100|
#> |uuid_63 | 47| 260| 49727| 100|
#> |uuid_64 | 55| 486| 30918| 100|
#> |uuid_65 | 54| 297| 47621| 100|
#> |uuid_66 | 50| 265| 46476| 100|
#> |uuid_67 | 51| 417| 48774| 100|
#> |uuid_68 | 52| 264| 48889| 100|
#> |uuid_69 | 49| 471| 33197| 100|
#> |uuid_70 | 55| 246| 21526| 100|
#> |uuid_71 | 55| 204| 37606| 100|
#> |uuid_72 | 52| 329| 34349| 100|
#> |uuid_73 | 55| 376| 39134| 100|
#> |uuid_74 | 47| 471| 32686| 100|
#> |uuid_75 | 54| 239| 32775| 100|
#> |uuid_76 | 51| 493| 26015| 100|
#> |uuid_77 | 49| 363| 48343| 100|
#> |uuid_78 | 52| 239| 30559| 100|
#> |uuid_79 | 45| 348| 32333| 100|
#> |uuid_80 | 53| 246| 29938| 100|
#> |uuid_81 | 48| 340| 41108| 100|
#> |uuid_82 | 46| 303| 35980| 100|
#> |uuid_83 | 52| 351| 37547| 100|
#> |uuid_84 | 46| 214| 27684| 100|
#> |uuid_85 | 47| 235| 28778| 100|
#> |uuid_86 | 49| 405| 20704| 100|
#> |uuid_87 | 46| 378| 42268| 100|
#> |uuid_88 | 47| 448| 29114| 100|
#> |uuid_89 | 51| 203| 22802| 100|
#> |uuid_90 | 51| 321| 46274| 100|
#> |uuid_91 | 1| 366| 41396| 100|
#> |uuid_92 | 1| 386| 34163| 100|
#> |uuid_93 | 0| 396| 32013| 100|
#> |uuid_94 | 0| 439| 34298| 100|
#> |uuid_95 | 1| 476| 49465| 100|
#> |uuid_96 | 99| 600| 38727| 100|
#> |uuid_97 | 99| 100| 25660| 100|
#> |uuid_98 | 99| 80| 27429| 100|
#> |uuid_99 | 100| 1020| 21427| 100|
#> |uuid_100 | 99| 1050| 25215| 10|
#>
#> |uuid |issue |question | old_value|
#> |:--------|:-----------------------------|:---------|---------:|
#> |uuid_91 |outlier (normal distribution) |one_value | 1|
#> |uuid_92 |outlier (normal distribution) |one_value | 1|
#> |uuid_93 |outlier (normal distribution) |one_value | 0|
#> |uuid_94 |outlier (normal distribution) |one_value | 0|
#> |uuid_95 |outlier (normal distribution) |one_value | 1|
#> |uuid_96 |outlier (normal distribution) |one_value | 99|
#> |uuid_97 |outlier (normal distribution) |one_value | 99|
#> |uuid_98 |outlier (normal distribution) |one_value | 99|
#> |uuid_99 |outlier (normal distribution) |one_value | 100|
#> |uuid_100 |outlier (normal distribution) |one_value | 99|
#> |uuid_99 |outlier (normal distribution) |expense | 1020|
#> |uuid_100 |outlier (normal distribution) |expense | 1050|
#> |uuid_97 |outlier (log distribution) |expense | 100|
#> |uuid_98 |outlier (log distribution) |expense | 80|
#> |uuid_1 |outlier (log distribution) |income | 60|
#> |uuid_2 |outlier (log distribution) |income | 0|
#> |uuid_3 |outlier (log distribution) |income | 80|
#> |uuid_100 |outlier (normal distribution) |yy | 10|