Skip to contents

Checks for duplicated values in columns

Usage

check_duplicate(
  dataset,
  uuid_column = "uuid",
  columns_to_check = NULL,
  log_name = "duplicate_log"
)

Arguments

dataset

Dataset to be checked, either as a dataframe or as a list with the dataframe stored as "checked_dataset".

uuid_column

Name of the uuid column in the dataset. Default is "uuid".

columns_to_check

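Character vector with the names of the columns to check. If NULL (default), the uuid_column is checked for duplicates. When several columns are given, a row is flagged only if its combination of values across those columns is duplicated (see the last example).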

log_name

Name of the log of flagged values. Default is "duplicate_log".

Value

Returns a list with the checked dataset stored as checked_dataset and a dataframe with the duplicate log.

Examples


testdata <- data.frame(
  uuid = c(letters[1:4], "a", "b", "c"),
  col_a = runif(7),
  col_b = runif(7)
)
 
check_duplicate(testdata) |>
  knitr::kable()  
#> 
#> 
#> |uuid |     col_a|     col_b|
#> |:----|---------:|---------:|
#> |a    | 0.3974663| 0.8077894|
#> |b    | 0.6818843| 0.6999456|
#> |c    | 0.0177946| 0.5466694|
#> |d    | 0.1220869| 0.5058249|
#> |a    | 0.7882961| 0.2248206|
#> |b    | 0.4744956| 0.9330499|
#> |c    | 0.6247803| 0.5299978|
#> 
#> |uuid |old_value |question |issue           |
#> |:----|:---------|:--------|:---------------|
#> |a    |a         |uuid     |duplicated uuid |
#> |b    |b         |uuid     |duplicated uuid |
#> |c    |c         |uuid     |duplicated uuid |
 
testdata2 <- data.frame(
  uuid = letters[c(1:7)],
  village = paste("village", c(1:3, 1:3, 4)),
  ki_identifier = paste0("xx_", c(1:5, 3, 4))
)
 
check_duplicate(testdata2, 
                columns_to_check = "village") |>
  knitr::kable()  
#> 
#> 
#> |uuid |village   |ki_identifier |
#> |:----|:---------|:-------------|
#> |a    |village 1 |xx_1          |
#> |b    |village 2 |xx_2          |
#> |c    |village 3 |xx_3          |
#> |d    |village 1 |xx_4          |
#> |e    |village 2 |xx_5          |
#> |f    |village 3 |xx_3          |
#> |g    |village 4 |xx_4          |
#> 
#> |uuid |question |old_value |issue              |
#> |:----|:--------|:---------|:------------------|
#> |d    |village  |village 1 |duplicated village |
#> |e    |village  |village 2 |duplicated village |
#> |f    |village  |village 3 |duplicated village |
 
check_duplicate(testdata2, 
                columns_to_check = c("village", "ki_identifier"),
                uuid_column = "uuid") |>
  knitr::kable()  
#> 
#> 
#> |uuid |village   |ki_identifier |
#> |:----|:---------|:-------------|
#> |a    |village 1 |xx_1          |
#> |b    |village 2 |xx_2          |
#> |c    |village 3 |xx_3          |
#> |d    |village 1 |xx_4          |
#> |e    |village 2 |xx_5          |
#> |f    |village 3 |xx_3          |
#> |g    |village 4 |xx_4          |
#> 
#> |uuid |question      |old_value |issue                                |
#> |:----|:-------------|:---------|:------------------------------------|
#> |f    |village       |village 3 |duplicated village ~/~ ki_identifier |
#> |f    |ki_identifier |xx_3      |duplicated village ~/~ ki_identifier |