Skip to contents

Check outliers over the dataset

Usage

check_outliers(
  dataset,
  uuid_column = "uuid",
  element_name = "checked_dataset",
  kobo_survey = NULL,
  kobo_choices = NULL,
  cols_to_add_cleaning_log = NULL,
  strongness_factor = 3,
  minimum_unique_value_of_variable = NULL,
  remove_choice_multiple = TRUE,
  sm_separator = ".",
  columns_not_to_check = NULL
)

Arguments

dataset

Dataset to be checked, either as a dataframe or as a list with the dataframe stored as "checked_dataset".

uuid_column

Name of the UUID column. Default is "uuid".

element_name

Name of the dataset element in the list. Default is "checked_dataset".

kobo_survey

Kobo survey sheet. Default is NULL.

kobo_choices

Kobo choices sheet. Default is NULL.

cols_to_add_cleaning_log

Variables that must be included in the output.

strongness_factor

Strongness factor defines how strong your outliers will be. The default is 3.

minimum_unique_value_of_variable

Default is NULL, meaning this parameter won't be considered. For example, a value of 10 means that any variable with fewer than 10 unique values won't be considered for outlier checking.

remove_choice_multiple

TRUE (default) will remove choice multiple questions from the output.

sm_separator

Separator for choice multiple questions. The default is "."

columns_not_to_check

Columns to exclude from the checks even if they are numeric values.

Value

Returns a list with the checked dataset stored as checked_dataset and a dataframe with the outliers log.

Examples


dataset_outlier <- data.frame(
  uuid = paste0("uuid_", 1:100),
  one_value = c(round(runif(90, min = 45, max = 55)), 
                round(runif(5)), round(runif(5, 99, 100))),
  expense = c(sample(200:500, 
                     replace = TRUE, size = 95), 
              c(600, 100, 80, 1020, 1050)),
  income = c(c(60, 0, 80, 1020, 1050), 
             sample(20000:50000, 
                    replace = TRUE, size = 95)),
  yy = c(rep(100, 99), 10)
)

check_outliers(dataset = dataset_outlier,
               uuid_column = "uuid") |>
  knitr::kable()
#> [1] "checking_one_value"
#> [1] "checking_expense"
#> [1] "checking_income"
#> [1] "checking_yy"
#> 
#> 
#> |uuid     | one_value| expense| income|  yy|
#> |:--------|---------:|-------:|------:|---:|
#> |uuid_1   |        52|     493|     60| 100|
#> |uuid_2   |        54|     481|      0| 100|
#> |uuid_3   |        52|     301|     80| 100|
#> |uuid_4   |        46|     345|   1020| 100|
#> |uuid_5   |        45|     334|   1050| 100|
#> |uuid_6   |        54|     291|  26745| 100|
#> |uuid_7   |        48|     356|  33039| 100|
#> |uuid_8   |        49|     240|  46348| 100|
#> |uuid_9   |        52|     402|  41300| 100|
#> |uuid_10  |        49|     452|  26168| 100|
#> |uuid_11  |        55|     358|  48669| 100|
#> |uuid_12  |        54|     474|  39139| 100|
#> |uuid_13  |        54|     378|  24139| 100|
#> |uuid_14  |        50|     212|  29721| 100|
#> |uuid_15  |        51|     298|  33614| 100|
#> |uuid_16  |        47|     374|  28146| 100|
#> |uuid_17  |        51|     294|  49733| 100|
#> |uuid_18  |        47|     484|  49323| 100|
#> |uuid_19  |        47|     252|  38004| 100|
#> |uuid_20  |        48|     447|  24262| 100|
#> |uuid_21  |        46|     389|  42262| 100|
#> |uuid_22  |        54|     258|  35281| 100|
#> |uuid_23  |        47|     370|  25840| 100|
#> |uuid_24  |        47|     356|  43295| 100|
#> |uuid_25  |        51|     347|  40482| 100|
#> |uuid_26  |        48|     260|  26899| 100|
#> |uuid_27  |        48|     303|  25093| 100|
#> |uuid_28  |        51|     275|  25310| 100|
#> |uuid_29  |        50|     352|  33109| 100|
#> |uuid_30  |        49|     464|  27606| 100|
#> |uuid_31  |        49|     440|  48220| 100|
#> |uuid_32  |        51|     500|  39080| 100|
#> |uuid_33  |        46|     455|  46805| 100|
#> |uuid_34  |        48|     316|  33885| 100|
#> |uuid_35  |        54|     319|  35508| 100|
#> |uuid_36  |        51|     487|  21745| 100|
#> |uuid_37  |        50|     312|  36469| 100|
#> |uuid_38  |        54|     301|  28567| 100|
#> |uuid_39  |        51|     268|  27697| 100|
#> |uuid_40  |        49|     493|  30667| 100|
#> |uuid_41  |        47|     244|  26243| 100|
#> |uuid_42  |        49|     418|  20572| 100|
#> |uuid_43  |        50|     221|  27488| 100|
#> |uuid_44  |        54|     393|  32131| 100|
#> |uuid_45  |        51|     376|  21886| 100|
#> |uuid_46  |        45|     423|  21910| 100|
#> |uuid_47  |        55|     384|  37576| 100|
#> |uuid_48  |        51|     274|  48702| 100|
#> |uuid_49  |        54|     236|  24046| 100|
#> |uuid_50  |        52|     493|  25600| 100|
#> |uuid_51  |        50|     412|  46541| 100|
#> |uuid_52  |        52|     497|  34092| 100|
#> |uuid_53  |        49|     487|  31383| 100|
#> |uuid_54  |        54|     368|  20056| 100|
#> |uuid_55  |        50|     416|  25855| 100|
#> |uuid_56  |        46|     282|  42760| 100|
#> |uuid_57  |        53|     250|  25162| 100|
#> |uuid_58  |        49|     203|  42320| 100|
#> |uuid_59  |        48|     299|  34478| 100|
#> |uuid_60  |        47|     236|  30256| 100|
#> |uuid_61  |        48|     236|  34236| 100|
#> |uuid_62  |        49|     395|  21488| 100|
#> |uuid_63  |        47|     260|  49727| 100|
#> |uuid_64  |        55|     486|  30918| 100|
#> |uuid_65  |        54|     297|  47621| 100|
#> |uuid_66  |        50|     265|  46476| 100|
#> |uuid_67  |        51|     417|  48774| 100|
#> |uuid_68  |        52|     264|  48889| 100|
#> |uuid_69  |        49|     471|  33197| 100|
#> |uuid_70  |        55|     246|  21526| 100|
#> |uuid_71  |        55|     204|  37606| 100|
#> |uuid_72  |        52|     329|  34349| 100|
#> |uuid_73  |        55|     376|  39134| 100|
#> |uuid_74  |        47|     471|  32686| 100|
#> |uuid_75  |        54|     239|  32775| 100|
#> |uuid_76  |        51|     493|  26015| 100|
#> |uuid_77  |        49|     363|  48343| 100|
#> |uuid_78  |        52|     239|  30559| 100|
#> |uuid_79  |        45|     348|  32333| 100|
#> |uuid_80  |        53|     246|  29938| 100|
#> |uuid_81  |        48|     340|  41108| 100|
#> |uuid_82  |        46|     303|  35980| 100|
#> |uuid_83  |        52|     351|  37547| 100|
#> |uuid_84  |        46|     214|  27684| 100|
#> |uuid_85  |        47|     235|  28778| 100|
#> |uuid_86  |        49|     405|  20704| 100|
#> |uuid_87  |        46|     378|  42268| 100|
#> |uuid_88  |        47|     448|  29114| 100|
#> |uuid_89  |        51|     203|  22802| 100|
#> |uuid_90  |        51|     321|  46274| 100|
#> |uuid_91  |         1|     366|  41396| 100|
#> |uuid_92  |         1|     386|  34163| 100|
#> |uuid_93  |         0|     396|  32013| 100|
#> |uuid_94  |         0|     439|  34298| 100|
#> |uuid_95  |         1|     476|  49465| 100|
#> |uuid_96  |        99|     600|  38727| 100|
#> |uuid_97  |        99|     100|  25660| 100|
#> |uuid_98  |        99|      80|  27429| 100|
#> |uuid_99  |       100|    1020|  21427| 100|
#> |uuid_100 |        99|    1050|  25215|  10|
#> 
#> |uuid     |issue                         |question  | old_value|
#> |:--------|:-----------------------------|:---------|---------:|
#> |uuid_91  |outlier (normal distribution) |one_value |         1|
#> |uuid_92  |outlier (normal distribution) |one_value |         1|
#> |uuid_93  |outlier (normal distribution) |one_value |         0|
#> |uuid_94  |outlier (normal distribution) |one_value |         0|
#> |uuid_95  |outlier (normal distribution) |one_value |         1|
#> |uuid_96  |outlier (normal distribution) |one_value |        99|
#> |uuid_97  |outlier (normal distribution) |one_value |        99|
#> |uuid_98  |outlier (normal distribution) |one_value |        99|
#> |uuid_99  |outlier (normal distribution) |one_value |       100|
#> |uuid_100 |outlier (normal distribution) |one_value |        99|
#> |uuid_99  |outlier (normal distribution) |expense   |      1020|
#> |uuid_100 |outlier (normal distribution) |expense   |      1050|
#> |uuid_97  |outlier (log distribution)    |expense   |       100|
#> |uuid_98  |outlier (log distribution)    |expense   |        80|
#> |uuid_1   |outlier (log distribution)    |income    |        60|
#> |uuid_2   |outlier (log distribution)    |income    |         0|
#> |uuid_3   |outlier (log distribution)    |income    |        80|
#> |uuid_100 |outlier (normal distribution) |yy        |        10|