Tutorial

library(tidyverse)
library(cleaningtools)

1. Checks on the dataset

1.1 Check of PII

check_pii() function takes raw data (input can be dataframe or list. However incase of list, you must specify the element name in element_name parameter!) and looks for potential PII in the dataset. By default, the function will look for following words but you can also add additional words to look by using words_to_look parameter.The default words are-c("telephone","contact","name","gps","neighbourhood","latitude","logitude","contact","nom","gps","voisinage"). The function will give a list with two element. One will be the data and second one will be the list of potential PII

Using dataframe as input

## Demo dataset
dataset <- cleaningtools::cleaningtools_raw_data

dataset |> head(5)  |> knitr::kable()

X.U.FEFF.start	end	date_assessment	deviceid	enumerator_num	neighbourhood	consent_remote	hhh_r	will_to_response_r	age_respondent_r	age_hoh	gender_hoh	displace_status	displace_status_returnee	return_date	num_hh_member	hh_hosting	num_hh_hosting	hh_hosted	shelter_occupation	shelter_occupation_other	property_title	rental_contract	inc_employment_pension	tot_expenses	primary_livelihood	primary_livelihood.savings	primary_livelihood.employment	primary_livelihood.remittences	primary_livelihood.retirement_fund	primary_livelihood.income_renting	primary_livelihood.selling_assets	primary_livelihood.selling_assisantance	primary_livelihood.loans	primary_livelihood.modm	primary_livelihood.support	primary_livelihood.ngo	primary_livelihood.social_service	primary_livelihood.illegal	primary_livelihood.zakat	primary_livelihood.other	primary_livelihood.dont_know	primary_livelihood.prefer_not_answer	primary_livelihood.own_business	primary_livelihood_other	type_employment	water_sources	water_sources.piped	water_sources.tap	water_sources.borehole	water_sources.well	water_sources.bottled	water_sources.trucking	water_sources.spring	water_sources.rainwater	water_sources.surface	water_sources.other	water_sources.dont_know	water_source_drinking	treat_drink_water	treat_drink_water_how	treat_drink_water_how.boil	treat_drink_water_how.let_stand	treat_drink_water_how.expose_sunlight	treat_drink_water_how.use_chlorine	treat_drink_water_how.filter	treat_drink_water_how.other	treat_drink_water_how.dont_know	water_source_cook	treat_cook_water	treat_cook_water_how	treat_cook_water_how.boil	treat_cook_water_how.let_stand	treat_cook_water_how.expose_sunlight	treat_cook_water_how.use_chlorine	treat_cook_water_how.filter	treat_cook_water_how.other	treat_cook_water_how.dont_know	piped_quality	piped_quality.acceptable	piped_quality.unacceptable_taste	piped_quality.unacceptable_colour	piped_quality.unacceptable_smell	piped_quality.materials	piped_quality.no_other	piped_quality.dont_know	tap_quality	tap_quality.acceptable	tap_quality.unacceptable_taste	tap_quality.unacceptable_colour	tap_quality.unacceptable_smell	tap_quality.materials	tap_quality.no_other	tap_quality.dont_know	borehole_quality	borehole_quality.acceptable	borehole_quality.unacceptable_taste	borehole_quality.unacceptable_colour	borehole_quality.unacceptable_smell	borehole_quality.materials	borehole_quality.no_other	borehole_quality.dont_know	trucking_quality	trucking_quality.acceptable	trucking_quality.unacceptable_taste	trucking_quality.unacceptable_colour	trucking_quality.unacceptable_smell	trucking_quality.materials	trucking_quality.no_other	trucking_quality.dont_know	water_source_bath	water_source_clothes	water_source_house	own_animals	water_source_animals	cultivate_crops	water_source_crops	days_available_piped	days_available_tap	days_available_borehole	days_available_trucking	hours_available_piped	hours_available_tap	hours_available_borehole	hours_available_trucking	satisfaction_piped	satisfaction_tap	satisfaction_borehole	satisfaction_bottled	satisfaction_trucking	spend_piped	spend_tap	spend_borehole	spend_bottled	spend_trucking	water_tank	water_tank_nb_sharing	water_tank_litres	water_tank_litres_nb	water_tank_consent	water_tank_shape	water_tank_height	water_tank_diameter	water_tank_obs_number	tank_store_piped	tank_store_tap	tank_store_borehole	tank_store_trucking	tank_fill_piped	tank_fill_tap	tank_fill_borehole	tank_fill_trucking	tank_emptied	bottle_volume	bottles_per_week	number_pumps	pump_horsepower	pump_days	pump_hours	pump_connection_piped_sys	responsible_collect_water	seasonal_variation	air_coolers	air_coolers_nb	air_coolers_hours	access_water_enough	access_water_enough_why_not	access_water_enough_why_not.not_enough_hours	access_water_enough_why_not.amount_not_enough	access_water_enough_why_not.inconvenient_hours	access_water_enough_why_not.supply_inconsistent	access_water_enough_why_not.too_expensive	access_water_enough_why_not.quality_poor	access_water_enough_why_not.water_pressure	access_water_enough_why_not.sources_closed	access_water_enough_why_not.not_available_shops	access_water_enough_why_not.tank_capacity	access_water_enough_why_not.points_difficult_reach	access_water_enough_why_not.some_groups_no_access	access_water_enough_why_not.other	access_water_enough_why_not.dont_know	access_water_enough_why_not.prefer_not_answer	access_water_enough_why_not_other	money_coping_strategy_water	water_system_connection	connection_fees	connection_fees_amount	pay_water_charges_when	pay_water_charges_last	pay_water_charges_amount	pay_water_charges_method	pay_water_charges_prefer	pay_water_charges_why_not	pay_water_charges_why_not_other	value_water_service	pay_more_water_service	water_meter_know	water_meter_willing	why_share_connection	why_share_connection.cannot_afford	why_share_connection.legal_status_property	why_share_connection.legal_status_hh	why_share_connection.paperwork	why_share_connection.not_worth	why_share_connection.government_responsibility	why_share_connection.civil_docs	why_share_connection.staff_refused	why_share_connection.other	why_share_connection.dont_know	why_share_connection.prefer_not_answer	why_no_subscription	why_no_subscription.cannot_afford	why_no_subscription.legal_status_property	why_no_subscription.legal_status_hh	why_no_subscription.paperwork	why_no_subscription.not_worth	why_no_subscription.government_responsibility	why_no_subscription.civil_docs	why_no_subscription.staff_refused	why_no_subscription.other	why_no_subscription.dont_know	why_no_subscription.prefer_not_answer	own_subscription_want	own_subscription_assist	own_subscription_assist_type	own_subscription_assist_type.paperwork	own_subscription_assist_type.legalising_property	own_subscription_assist_type.civil_docs	own_subscription_assist_type.dealing_staff	own_subscription_assist_type.paying	own_subscription_assist_type.other	own_subscription_assist_type.dont_know	problems_water	problems_water_who_appr	problems_water_who_appr.nobody	problems_water_who_appr.water_office	problems_water_who_appr.municipal_staff	problems_water_who_appr.community_rep	problems_water_who_appr.ninewa_dow	problems_water_who_appr.ngo	problems_water_who_appr.plumber	problems_water_who_appr.other	problems_water_who_appr.dont_know	problems_water_who_appr.prefer_not_answer	problems_water_who_appr_not	problems_water_who_appr_not_other	problems_water_action	problems_water_time	problems_water_who_attend	problems_water_who_attend.nobody	problems_water_who_attend.water_office	problems_water_who_attend.municipal_staff	problems_water_who_attend.community_rep	problems_water_who_attend.ninewa_dow	problems_water_who_attend.ngo	problems_water_who_attend.plumber	problems_water_who_attend.other	problems_water_who_attend.dont_know	problems_water_who_attend.prefer_not_answer	problems_water_main_complaint	problems_water_main_complaint_other	leaks	leaks_reported	level_service	level_service_why_poor	level_service_why_poor_other	regularity_supply	water_office_receptiveness	water_supply_before_2014	water_supply_rest_neighbourhood	water_supply_other_neighbourhoods	water_supply_other_neighbourhoods_how	water_supply_other_neighbourhoods_how.less_hours_days	water_supply_other_neighbourhoods_how.smaller_quantity	water_supply_other_neighbourhoods_how.hours_less_inconvenient	water_supply_other_neighbourhoods_how.charges_higher	water_supply_other_neighbourhoods_how.quality_poorer	water_supply_other_neighbourhoods_how.pressure_lower	water_supply_other_neighbourhoods_how.infrastructure_poorer	water_supply_other_neighbourhoods_how.less_infrastructure	water_supply_other_neighbourhoods_how.other	water_supply_other_neighbourhoods_how.prefer_not_answer	water_supply_other_neighbourhoods_why	majority_pay	water_office_fixes_leaks	water_office_maintains	received_outreach	trust_water_office	trust_water_office_why_not	users_build_trust	users_build_trust.nothing	users_build_trust.pay_bills	users_build_trust.report_leaks	users_build_trust.conserve_water	users_build_trust.other	users_build_trust.dont_know	users_build_trust.prefer_not_answer	suggestions_improve_services	suggestions_improve_services.increase_hours	suggestions_improve_services.hours_more_convenient	suggestions_improve_services.improve_quality	suggestions_improve_services.reduce_price	suggestions_improve_services.increase_pressure	suggestions_improve_services.improve_infrastructure	suggestions_improve_services.new_infrastructure	suggestions_improve_services.extend_network	suggestions_improve_services.repair_leaks	suggestions_improve_services.other	suggestions_improve_services.dont_know	suggestions_improve_services.prefer_not_answer	suggestions_improve_services_other	help_office_improve	help_office_improve.cannot_help	help_office_improve.pay_bills	help_office_improve.report_leaks	help_office_improve.conserve_water	help_office_improve.other	help_office_improve.dont_know	help_office_improve.prefer_not_answer	how_engaged_by_office	how_engaged_by_office.prefer_not_engaged	how_engaged_by_office.face_to_face_home	how_engaged_by_office.face_to_face_office	how_engaged_by_office.community_rep	how_engaged_by_office.phone	how_engaged_by_office.sms	how_engaged_by_office.email	how_engaged_by_office.letter	how_engaged_by_office.twitter	how_engaged_by_office.facebook	how_engaged_by_office.whatsapp	how_engaged_by_office.complaints_box	how_engaged_by_office.billboards	how_engaged_by_office.posters	how_engaged_by_office.leaflets	how_engaged_by_office.other	how_engaged_by_office.prefer_not_answer	prefer_not_engage	prefer_not_engage_other	consent_telephone_number	consent_share_location	X_id	X_uuid	X_submission_time	X_notes	X_status	X_submitted_by	X_index
2021-07-05T10:58:34.147+03:00	2021-07-05T11:36:41.138+03:00	2021-07-05	collect:0uAiKDfbbpqENdqw	13	A2	yes	yes	NA	52	NA	male	yes	yes	2018-06-11	7	no	NA	no	owned	NA	no	NA	250000	250000	loans support	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	TRUE	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	NA	NA	NA	piped	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	piped	never_treat	NA	NA	NA	NA	NA	NA	NA	NA	piped	never_treat	NA	NA	NA	NA	NA	NA	NA	NA	acceptable	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	piped	piped	piped	yes	piped	yes	piped	1	NA	NA	NA	7	NA	NA	NA	very_dissatisfied	NA	NA	NA	NA	NA	NA	NA	NA	NA	yes	7	yes	5000	NA	NA	NA	NA	NA	yes	NA	NA	NA	4	NA	NA	NA	fully	NA	NA	NA	NA	NA	NA	yes	no_specific_person	some_more_summer	yes	2	19	just_enough	not_enough_hours inconvenient_hours supply_inconsistent	TRUE	FALSE	TRUE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	NA	no	permanent_subscription	yes	9000	every	past_2_months	9000	cash	representative_home	NA	NA	price_equal	yes	no	no	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	yes	plumber	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	TRUE	FALSE	FALSE	FALSE	NA	NA	no	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	network_broker	NA	yes	no	adequate	NA	NA	very_poor	adequate	stayed_same	stayed_same	somewhat_worse	less_hours_days infrastructure_poorer hours_less_inconvenient	TRUE	FALSE	TRUE	FALSE	FALSE	FALSE	TRUE	FALSE	FALSE	FALSE	خارج حدود البلدية	yes	no	no	no	yes	NA	pay_bills report_leaks	FALSE	TRUE	TRUE	FALSE	FALSE	FALSE	FALSE	increase_hours hours_more_convenient improve_infrastructure extend_network repair_leaks	TRUE	TRUE	FALSE	FALSE	FALSE	TRUE	FALSE	TRUE	TRUE	FALSE	FALSE	FALSE	NA	conserve_water pay_bills	FALSE	TRUE	FALSE	TRUE	FALSE	FALSE	FALSE	face_to_face_home	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	NA	NA	yes	yes	190999135	dcf2753a-6ea2-40f5-b493-3527931ef96c	2021-07-05T09:34:15	[]	submitted_via_web	reach_irq	1
2021-07-05T10:08:54.905+03:00	2021-07-05T10:41:55.060+03:00	2021-07-05	collect:0uAiKDfbbpqENdqw	13	A2	yes	yes	NA	50	NA	male	yes	yes	2019-08-13	7	no	NA	no	granted_organisation	NA	NA	NA	1000000	750000	employment	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	NA	NA	permanent_job	tap	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	tap	sometimes_treat	filter	FALSE	FALSE	FALSE	FALSE	TRUE	FALSE	FALSE	tap	sometimes_treat	filter	FALSE	FALSE	FALSE	FALSE	TRUE	FALSE	FALSE	NA	NA	NA	NA	NA	NA	NA	NA	acceptable	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	tap	tap	tap	no	NA	no	NA	NA	7	NA	NA	NA	15	NA	NA	NA	somewhat_dissatisfied	NA	NA	NA	NA	10000	NA	NA	NA	yes	7	yes	1000	NA	NA	NA	NA	NA	NA	yes	NA	NA	NA	6	NA	NA	about_half	NA	NA	NA	NA	NA	NA	NA	no_specific_person	some_more_summer	yes	2	15	just_enough	not_enough_hours water_pressure	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	NA	no	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	yes	plumber	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	TRUE	FALSE	FALSE	FALSE	NA	NA	no	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	network_broker	NA	yes	no	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	yes	yes	yes	no	yes	NA	pay_bills report_leaks conserve_water	FALSE	TRUE	TRUE	TRUE	FALSE	FALSE	FALSE	increase_hours increase_pressure new_infrastructure extend_network	TRUE	FALSE	FALSE	FALSE	TRUE	FALSE	TRUE	TRUE	FALSE	FALSE	FALSE	FALSE	NA	pay_bills report_leaks conserve_water	FALSE	TRUE	TRUE	TRUE	FALSE	FALSE	FALSE	community_rep facebook	FALSE	FALSE	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	NA	NA	yes	yes	190999084	8790ce5c-1c35-41a2-b3c0-538f937d5397	2021-07-05T09:34:09	[]	submitted_via_web	reach_irq	2
2021-07-05T11:22:14.739+03:00	2021-07-05T12:06:26.056+03:00	2021-07-05	collect:OhMOqrmZvjx0RfGF	8	A2	yes	yes	NA	33	NA	male	yes	yes	2018-04-04	5	no	NA	no	renting	NA	NA	no	300000	250000	other	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	TRUE	FALSE	FALSE	NA	عامل نجاره	NA	piped borehole	TRUE	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	piped	always_treat	filter	FALSE	FALSE	FALSE	FALSE	TRUE	FALSE	FALSE	piped	sometimes_treat	let_stand	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	unacceptable_taste	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	NA	NA	NA	NA	NA	NA	NA	NA	unacceptable_taste	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	NA	NA	NA	NA	NA	NA	NA	NA	piped	piped	piped	no	NA	no	NA	2	NA	2	NA	4	NA	3	NA	somewhat_dissatisfied	NA	somewhat_dissatisfied	NA	NA	NA	NA	23000	NA	NA	yes	5	no	NA	yes	cylinder	150	75	3	yes	NA	no	NA	4	NA	NA	NA	fully	NA	NA	1	1.5	3	4	yes	adult_man	some_more_summer	yes	2	24	just_enough	amount_not_enough not_enough_hours	TRUE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	NA	no	permanent_subscription	no	NA	most	past_2_months	33000	cash	office	NA	NA	price_equal	yes	yes	yes	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	no	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	yes	yes	adequate	NA	NA	adequate	poor	much_better	stayed_same	much_better	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	no	no	no	no	yes	NA	pay_bills	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	increase_hours improve_quality	TRUE	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	NA	pay_bills	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	face_to_face_home	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	NA	NA	yes	yes	190999246	bb818e04-9c40-408e-919f-6b40ff1fdbb3	2021-07-05T09:34:33	[]	submitted_via_web	reach_irq	3
2021-07-04T22:22:59.192+03:00	2021-07-04T23:00:09.446+03:00	2021-07-04	collect:Q2LAMRy7dqL4ONE6	15	A2	yes	yes	NA	57	NA	male	yes	yes	2019-12-04	8	no	NA	no	owned	NA	yes	NA	750000	600000	employment	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	NA	NA	permanent_job	piped	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	piped	always_treat	filter	FALSE	FALSE	FALSE	FALSE	TRUE	FALSE	FALSE	piped	always_treat	filter	FALSE	FALSE	FALSE	FALSE	TRUE	FALSE	FALSE	acceptable	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	piped	piped	piped	no	NA	no	NA	3	NA	NA	NA	20	NA	NA	NA	somewhat_satisfied	NA	NA	NA	NA	NA	NA	NA	NA	NA	yes	8	yes	4000	NA	NA	NA	NA	NA	yes	NA	NA	NA	10	NA	NA	NA	about_half	NA	NA	NA	NA	NA	NA	yes	adult_woman	some_more_summer	yes	3	12	sufficient	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	no	temporary_subscription	no	NA	most	past_2_months	9000	cash	representative_home	NA	NA	price_lower	yes	yes	yes	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	no	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	no	NA	good	NA	NA	good	good	somewhat_better	stayed_same	somewhat_worse	pressure_lower less_hours_days	TRUE	FALSE	FALSE	FALSE	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	بعيدا	yes	no	yes	no	yes	NA	pay_bills	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	improve_quality hours_more_convenient	FALSE	TRUE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	NA	pay_bills	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	face_to_face_home	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	NA	NA	yes	yes	190998651	28b90cbb-2cf0-41c5-9ee1-1c719c0d4c02	2021-07-05T09:32:58	[]	submitted_via_web	reach_irq	4
2021-07-04T23:11:37.273+03:00	2021-07-05T00:18:53.413+03:00	2021-07-04	collect:Q2LAMRy7dqL4ONE6	15	A2	yes	yes	NA	39	NA	male	yes	yes	2018-12-01	10	no	NA	no	owned	NA	yes	NA	650000	500000	employment	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	NA	NA	day_labour	piped	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	piped	always_treat	expose_sunlight	FALSE	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	piped	sometimes_treat	expose_sunlight	FALSE	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	acceptable	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	piped	piped	piped	no	NA	no	NA	2	NA	NA	NA	8	NA	NA	NA	neither_satisfied	NA	NA	NA	NA	NA	NA	NA	NA	NA	yes	10	yes	5000	NA	NA	NA	NA	NA	yes	NA	NA	NA	8	NA	NA	NA	mostly	NA	NA	NA	NA	NA	NA	yes	adult_woman	some_more_summer	yes	1	12	sufficient	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	no	temporary_subscription	no	NA	most	2_4_months	9000	cash	representative_home	NA	NA	price_lower	yes	yes	yes	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	no	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	no	NA	good	NA	NA	adequate	good	somewhat_better	somewhat_better	somewhat_better	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	yes	yes	yes	no	yes	NA	pay_bills	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	increase_hours improve_quality repair_leaks	TRUE	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	TRUE	FALSE	FALSE	FALSE	NA	pay_bills report_leaks	FALSE	TRUE	TRUE	FALSE	FALSE	FALSE	FALSE	face_to_face_home	FALSE	TRUE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	FALSE	NA	NA	yes	yes	190998662	7f2a0c6a-529b-481f-963f-a96dca2ec034	2021-07-05T09:32:59	[]	submitted_via_web	reach_irq	5

output_from_data <- cleaningtools::check_pii(dataset = dataset, 
                                             words_to_look = "date", 
                                             uuid_column = "X_uuid")

output_from_data$potential_PII |> 
  head() |> 
  knitr::kable()

uuid	question	issue
all	date_assessment	Potential PII
all	neighbourhood	Potential PII
all	return_date	Potential PII
all	water_supply_rest_neighbourhood	Potential PII
all	water_supply_other_neighbourhoods	Potential PII
all	water_supply_other_neighbourhoods_why	Potential PII

Using list as input

raw_data = cleaningtools::cleaningtools_raw_data
### from list
df_list <- list(raw_data = raw_data)
output_from_list <- cleaningtools::check_pii(dataset = df_list, 
                                             element_name = "raw_data", 
                                             words_to_look = "date", 
                                             uuid_column = "X_uuid")

output_from_list$potential_PII |>
  head() |> 
  knitr::kable()

uuid	question	issue
all	date_assessment	Potential PII
all	neighbourhood	Potential PII
all	return_date	Potential PII
all	water_supply_rest_neighbourhood	Potential PII
all	water_supply_other_neighbourhoods	Potential PII
all	water_supply_other_neighbourhoods_why	Potential PII

1.2 Check of duration from audits

1.2.1 Reading the audits files

It will read only the compressed file.

my_audit_list <- cleaningtools::create_audit_list(audit_zip_path = "audit_for_tests_100.zip")

1.2.2 Adding the duration to the dataset

Once you have read your audit file from the zip, you will get a list of audit. You can use this list to calculate and add the duration. You have 2 options with a start and end question or summing all the durations.

list_audit <- list(
  uuid1 = data.frame(
    event = c("form start", rep("question", 5)),
    node = c("", paste0("/xx/question", 1:5)),
    start = c(
      1661415887295, 1661415887301,
      1661415890819, 1661415892297,
      1661415893529, 1661415894720
    ),
    end = c(
      NA, 1661415890790, 1661415892273,
      1661415893506, 1661415894703,
      1661415896452
    )
  ),
  uuid2 = data.frame(
    event = c("form start", rep("question", 5)),
    node = c("", paste0("/xx/question", 1:5)),
    start = c(1661415887295, 1661415887301, 1661415890819, 1661415892297, 1661415893529, 1661415894720),
    end = c(NA, 1661415890790, 1661415892273, 1661415893506, 1661415894703, 1661415896452)
  )
)

some_dataset <- data.frame(
  X_uuid = c("uuid1", "uuid2"),
  question1 = c("a", "b"),
  question2 = c("a", "b"),
  question3 = c("a", "b"),
  question4 = c("a", "b"),
  question5 = c("a", "b")
)

If you want to sum all the duration.

cleaningtools::add_duration_from_audit(some_dataset, 
                                       uuid_column = "X_uuid", 
                                       audit_list = list_audit)

##   X_uuid question1 question2 question3 question4 question5
## 1  uuid1         a         a         a         a         a
## 2  uuid2         b         b         b         b         b
##   duration_audit_sum_all_ms duration_audit_sum_all_minutes
## 1                      9058                            0.2
## 2                      9058                            0.2

If you want to use calculate duration between 2 questions.

cleaningtools::add_duration_from_audit(some_dataset,
  uuid_column = "X_uuid", audit_list = list_audit,
  start_question = "question1",
  end_question = "question3",
  sum_all = F
)

##   X_uuid question1 question2 question3 question4 question5
## 1  uuid1         a         a         a         a         a
## 2  uuid2         b         b         b         b         b
##   duration_audit_start_end_ms duration_audit_start_end_minutes
## 1                        6205                              0.1
## 2                        6205                              0.1

If you want to do both.

cleaningtools::add_duration_from_audit(some_dataset,
  uuid_column = "X_uuid", audit_list = list_audit,
  start_question = "question1",
  end_question = "question3",
  sum_all = T
)

##   X_uuid question1 question2 question3 question4 question5
## 1  uuid1         a         a         a         a         a
## 2  uuid2         b         b         b         b         b
##   duration_audit_sum_all_ms duration_audit_sum_all_minutes
## 1                      9058                            0.2
## 2                      9058                            0.2
##   duration_audit_start_end_ms duration_audit_start_end_minutes
## 1                        6205                              0.1
## 2                        6205                              0.1

1.2.3 checking the duration of the dataset

Once you have added the duration to the dataset, you can check if duration are between the threshold you are looking for.

testdata <- data.frame(
  uuid = c(letters[1:7]),
  duration_audit_start_end_ms = c(
    2475353, 375491, 2654267, 311585, 817270,
    2789505, 8642007
  ),
  duration_audit_start_end_minutes = c(41, 6, 44, 5, 14, 46, 144)
)

cleaningtools::check_duration(testdata, column_to_check = "duration_audit_start_end_minutes") |>
  head() |> 
  knitr::kable()

uuid	duration_audit_start_end_ms	duration_audit_start_end_minutes
a	2475353	41
b	375491	6
c	2654267	44
d	311585	5
e	817270	14
f	2789505	46
g	8642007	144

uuid	old_value	question	issue
b	6	duration_audit_start_end_minutes	Duration is lower or higher than the thresholds
d	5	duration_audit_start_end_minutes	Duration is lower or higher than the thresholds
e	14	duration_audit_start_end_minutes	Duration is lower or higher than the thresholds
g	144	duration_audit_start_end_minutes	Duration is lower or higher than the thresholds

cleaningtools::check_duration(
  testdata,
  column_to_check = "duration_audit_start_end_ms",
  lower_bound = 375490,
  higher_bound = 8642000) |>
  head() |> 
  knitr::kable()

uuid	duration_audit_start_end_ms	duration_audit_start_end_minutes
a	2475353	41
b	375491	6
c	2654267	44
d	311585	5
e	817270	14
f	2789505	46
g	8642007	144

uuid	old_value	question	issue
d	311585	duration_audit_start_end_ms	Duration is lower or higher than the thresholds
g	8642007	duration_audit_start_end_ms	Duration is lower or higher than the thresholds

testdata %>%
  cleaningtools::check_duration(column_to_check = "duration_audit_start_end_minutes") %>%
  check_duration(column_to_check = "duration_audit_start_end_ms",
                  log_name = "duration_in_ms",
                  lower_bound = 375490,
                  higher_bound = 8642000  ) |>
  head() |> 
  knitr::kable()

uuid	duration_audit_start_end_ms	duration_audit_start_end_minutes
a	2475353	41
b	375491	6
c	2654267	44
d	311585	5
e	817270	14
f	2789505	46
g	8642007	144

uuid	old_value	question	issue
b	6	duration_audit_start_end_minutes	Duration is lower or higher than the thresholds
d	5	duration_audit_start_end_minutes	Duration is lower or higher than the thresholds
e	14	duration_audit_start_end_minutes	Duration is lower or higher than the thresholds
g	144	duration_audit_start_end_minutes	Duration is lower or higher than the thresholds

uuid	old_value	question	issue
d	311585	duration_audit_start_end_ms	Duration is lower or higher than the thresholds
g	8642007	duration_audit_start_end_ms	Duration is lower or higher than the thresholds

1.3 Check outliers

check_outliers() takes raw data set and look for potential outlines. It can both data frame or list. However you must specify the element name (name of your data set in the given list) in element_name parameter!

set.seed(122)
### from list
df_outlier <- data.frame(
  uuid = paste0("uuid_", 1:100),
  one_value = c(round(runif(90, min = 45, max = 55)), round(runif(5)), round(runif(5, 99, 100))),
  expense = c(sample(200:500, replace = T, size = 95), c(600, 100, 80, 1020, 1050)),
  income = c(c(60, 0, 80, 1020, 1050), sample(20000:50000, replace = T, size = 95)),
  yy = c(rep(100, 99), 10)
)
outliers <- cleaningtools::check_outliers(dataset = df_outlier, uuid_column = "uuid")

## [1] "checking_one_value"
## [1] "checking_expense"
## [1] "checking_income"
## [1] "checking_yy"

outliers$potential_outliers |>head() |> knitr::kable()

uuid	issue	question	old_value
uuid_91	outlier (normal distribution)	one_value	1
uuid_92	outlier (normal distribution)	one_value	0
uuid_93	outlier (normal distribution)	one_value	0
uuid_94	outlier (normal distribution)	one_value	0
uuid_95	outlier (normal distribution)	one_value	0
uuid_96	outlier (normal distribution)	one_value	100

1.4 Check for value

check_value() function look for specified value in the given data set and return in a cleaning log format. The function can take a data frame or a list as input.

set.seed(122)

df <- data.frame(
  X_uuid = paste0("uuid_", 1:100),
  age = c(sample(18:80, replace = T, size = 96), 99, 99, 98, 88),
  gender = c("99", sample(c("male", "female"), replace = T, size = 95), "98", "98", "88", "888")
)

output <- cleaningtools::check_value(dataset = df, uuid_column = "X_uuid", element_name = "checked_dataset", values_to_look = c(99, 98, 88, 888))

output$flaged_value |>head() |> knitr::kable()

uuid	question	old_value	issue
uuid_1	gender	99	Possible value to be changed to NA
uuid_97	age	99	Possible value to be changed to NA
uuid_97	gender	98	Possible value to be changed to NA
uuid_98	age	99	Possible value to be changed to NA
uuid_98	gender	98	Possible value to be changed to NA
uuid_99	age	98	Possible value to be changed to NA

1.5 Check logics

check_logical() takes a regular expression, as it is how it will be read from Excel check_logical_with_list().

test_data <- data.frame(
  uuid = c(1:10) %>% as.character(),
  today = rep("2023-01-01", 10),
  location = rep(c("villageA", "villageB"), 5),
  distance_to_market = c(rep("less_30", 5), rep("more_30", 5)),
  access_to_market = c(rep("yes", 4), rep("no", 6)),
  number_children_05 = c(rep(c(0, 1), 4), 5, 6)
)
cleaningtools::check_logical(test_data,
  uuid_column = "uuid",
  check_to_perform = "distance_to_market == \"less_30\" & access_to_market == \"no\"",
  columns_to_clean = "distance_to_market, access_to_market",
  description = "distance to market less than 30 and no access"
) |>head() |> knitr::kable()

uuid	today	location	distance_to_market	access_to_market	number_children_05	logical_xx
1	2023-01-01	villageA	less_30	yes	0	FALSE
2	2023-01-01	villageB	less_30	yes	1	FALSE
3	2023-01-01	villageA	less_30	yes	0	FALSE
4	2023-01-01	villageB	less_30	yes	1	FALSE
5	2023-01-01	villageA	less_30	no	0	TRUE
6	2023-01-01	villageB	more_30	no	1	FALSE
7	2023-01-01	villageA	more_30	no	0	FALSE
8	2023-01-01	villageB	more_30	no	1	FALSE
9	2023-01-01	villageA	more_30	no	5	FALSE
10	2023-01-01	villageB	more_30	no	6	FALSE

uuid	question	old_value	issue	check_id	check_binding
5	distance_to_market	less_30	distance to market less than 30 and no access	logical_xx	logical_xx _/ 5
5	access_to_market	no	distance to market less than 30 and no access	logical_xx	logical_xx _/ 5

test_data <- data.frame(
  uuid = c(1:10) %>% as.character(),
  distance_to_market = rep(c("less_30", "more_30"), 5),
  access_to_market = c(rep("yes", 4), rep("no", 6)),
  number_children_05 = c(rep(c(0, 1), 4), 5, 6),
  number_children_618 = c(rep(c(0, 1), 4), 5, 6)
)

check_list <- data.frame(
  name = c("logical_xx", "logical_yy", "logical_zz"),
  check = c(
    "distance_to_market == \"less_30\" & access_to_market == \"no\"",
    "number_children_05 > 3",
    "rowSums(dplyr::across(starts_with(\"number\")), na.rm = T) > 9"
  ),
  description = c(
    "distance to market less than 30 and no access",
    "number of children under 5 seems high",
    "number of children very high"
  ),
  variables_to_clean = c(
    "distance_to_market, access_to_market",
    "number_children_05",
    ""
  )
)
cleaningtools::check_logical_with_list(test_data,
  uuid_column = "uuid",
  list_of_check = check_list,
  check_id_column = "name",
  check_to_perform_column = "check",
  columns_to_clean_column = "variables_to_clean",
  description_column = "description"
) |>head() |> knitr::kable()

## Warning in check_logical(dataset = dataset, uuid_column = uuid_column,
## information_to_add = information_to_add, : columns_to_clean not shared, results
## may not be accurate

uuid	distance_to_market	access_to_market	number_children_05	number_children_618	logical_xx	logical_yy	logical_zz
1	less_30	yes	0	0	FALSE	FALSE	FALSE
2	more_30	yes	1	1	FALSE	FALSE	FALSE
3	less_30	yes	0	0	FALSE	FALSE	FALSE
4	more_30	yes	1	1	FALSE	FALSE	FALSE
5	less_30	no	0	0	TRUE	FALSE	FALSE
6	more_30	no	1	1	FALSE	FALSE	FALSE
7	less_30	no	0	0	TRUE	FALSE	FALSE
8	more_30	no	1	1	FALSE	FALSE	FALSE
9	less_30	no	5	5	TRUE	TRUE	TRUE
10	more_30	no	6	6	FALSE	TRUE	TRUE

uuid	question	old_value	issue	check_id	check_binding
5	distance_to_market	less_30	distance to market less than 30 and no access	logical_xx	logical_xx _/ 5
5	access_to_market	no	distance to market less than 30 and no access	logical_xx	logical_xx _/ 5
7	distance_to_market	less_30	distance to market less than 30 and no access	logical_xx	logical_xx _/ 7
7	access_to_market	no	distance to market less than 30 and no access	logical_xx	logical_xx _/ 7
9	distance_to_market	less_30	distance to market less than 30 and no access	logical_xx	logical_xx _/ 9
9	access_to_market	no	distance to market less than 30 and no access	logical_xx	logical_xx _/ 9
9	number_children_05	5	number of children under 5 seems high	logical_yy	logical_yy _/ 9
10	number_children_05	6	number of children under 5 seems high	logical_yy	logical_yy _/ 10
9	unable to identify	please check this uuid for this check	number of children very high	logical_zz	logical_zz _/ 9
10	unable to identify	please check this uuid for this check	number of children very high	logical_zz	logical_zz _/ 10

1.6 Check for duplicates

1.6.1 With one or several variables

testdata <- data.frame(
  uuid = c(letters[1:4], "a", "b", "c"),
  col_a = runif(7),
  col_b = runif(7)
)
cleaningtools::check_duplicate(testdata)

## $checked_dataset
##   uuid     col_a     col_b
## 1    a 0.9166284 0.2958878
## 2    b 0.3212423 0.6047783
## 3    c 0.2052720 0.8932896
## 4    d 0.1512732 0.1883179
## 5    a 0.8096267 0.3822435
## 6    b 0.5861553 0.6640918
## 7    c 0.9902022 0.7932066
## 
## $duplicate_log
##   uuid old_value question           issue
## 1    a         a     uuid duplicated uuid
## 2    b         b     uuid duplicated uuid
## 3    c         c     uuid duplicated uuid

Or you can check duplicate for a specific variable or combination of variables.

testdata2 <- data.frame(
  uuid = letters[c(1:7)],
  village = paste("village", c(1:3, 1:3, 4)),
  ki_identifier = paste0("xx_", c(1:5, 3, 4))
)
check_duplicate(testdata2, columns_to_check = "village")

## $checked_dataset
##   uuid   village ki_identifier
## 1    a village 1          xx_1
## 2    b village 2          xx_2
## 3    c village 3          xx_3
## 4    d village 1          xx_4
## 5    e village 2          xx_5
## 6    f village 3          xx_3
## 7    g village 4          xx_4
## 
## $duplicate_log
## # A tibble: 3 × 4
##   uuid  question old_value issue             
##   <chr> <chr>    <chr>     <glue>            
## 1 d     village  village 1 duplicated village
## 2 e     village  village 2 duplicated village
## 3 f     village  village 3 duplicated village

check_duplicate(testdata2, columns_to_check = c("village", "ki_identifier"))

## $checked_dataset
##   uuid   village ki_identifier
## 1    a village 1          xx_1
## 2    b village 2          xx_2
## 3    c village 3          xx_3
## 4    d village 1          xx_4
## 5    e village 2          xx_5
## 6    f village 3          xx_3
## 7    g village 4          xx_4
## 
## $duplicate_log
## # A tibble: 2 × 4
##   uuid  question      old_value issue                               
##   <chr> <chr>         <chr>     <glue>                              
## 1 f     village       village 3 duplicated village ~/~ ki_identifier
## 2 f     ki_identifier xx_3      duplicated village ~/~ ki_identifier

1.6.2 With the gower distance (soft duplicates)

To use it with the complete dataset:

soft_duplicates <- check_soft_duplicates(
  dataset = cleaningtools_raw_data,
  kobo_survey = cleaningtools_survey,
  uuid_column = "X_uuid",
  idnk_value = "dont_know",
  sm_separator = ".",
  log_name = "soft_duplicate_log",
  threshold = 7
)

soft_duplicates[["soft_duplicate_log"]] %>%head() |> knitr::kable()

uuid	issue

soft_duplicates <- check_soft_duplicates(
  dataset = cleaningtools_raw_data,
  kobo_survey = cleaningtools_survey,
  uuid_column = "X_uuid",
  idnk_value = "dont_know",
  sm_separator = ".",
  log_name = "soft_duplicate_log",
  threshold = 7, 
  return_all_results = TRUE
)

soft_duplicates[["soft_duplicate_log"]] %>%head() |> knitr::kable()

uuid	num_cols_not_NA	total_columns_compared	id_most_similar_survey	number_different_columns	issue
3370f726-395a-4675-94fe-9e745e0b36e9	75	148	dc7bf25b-e18b-4b9e-bb34-5d7a1e762eb2	9	NA
93095da3-5291-4d16-a19a-41bf13144bfe	85	148	db5e05db-94e9-44aa-9206-3e1c17a7a233	9	NA
db5e05db-94e9-44aa-9206-3e1c17a7a233	85	148	93095da3-5291-4d16-a19a-41bf13144bfe	9	NA
dc7bf25b-e18b-4b9e-bb34-5d7a1e762eb2	75	148	3370f726-395a-4675-94fe-9e745e0b36e9	9	NA
0858486a-1d3d-492b-863f-b050cb9fe7af	74	148	93893e39-9c82-4e19-b480-4dc78033157b	10	NA
193d5f36-93b9-4c97-9205-13aa7e3a6c7f	75	148	ac42b381-4d3b-42b4-96fb-676d43a8c4e7	10	NA

To use it grouping by enumerator:

group_by_enum_raw_data <- cleaningtools_raw_data %>%
  dplyr::group_by(enumerator_num)
soft_per_enum <- group_by_enum_raw_data %>%
  dplyr::group_split() %>%
  purrr::map(~ check_soft_duplicates(
    dataset = .,
    kobo_survey = cleaningtools_survey,
    uuid_column = "X_uuid", idnk_value = "dont_know",
    sm_separator = ".",
    log_name = "soft_duplicate_log",
    threshold = 7, 
    return_all_results = TRUE
  ))
soft_per_enum %>%
  purrr::map(~ .[["soft_duplicate_log"]]) %>%
  purrr::map2(
    .y = dplyr::group_keys(group_by_enum_raw_data) %>% unlist(),
    ~ dplyr::mutate(.x, enum = .y)
  ) %>%
  do.call(dplyr::bind_rows, .) %>%
 head() |> knitr::kable()

uuid	num_cols_not_NA	total_columns_compared	num_cols_dont_know	id_most_similar_survey	number_different_columns	issue	enum
44614627-c152-4f24-a3ca-87a58b2f2e3f	65	118	2	a537a7a3-468c-4661-8b7b-93e43e9b8a3b	17	NA	1
a537a7a3-468c-4661-8b7b-93e43e9b8a3b	63	118	1	44614627-c152-4f24-a3ca-87a58b2f2e3f	17	NA	1
4cf2c1c2-75a9-4be1-ab1a-09e0b0bec0bd	87	118	2	b6dc0988-15d1-49c6-859f-b0e63c18485b	18	NA	1
b6dc0988-15d1-49c6-859f-b0e63c18485b	86	118	2	4cf2c1c2-75a9-4be1-ab1a-09e0b0bec0bd	18	NA	1
55e388ed-fdf2-4d53-96b8-8c406947cad3	64	118	4	a537a7a3-468c-4661-8b7b-93e43e9b8a3b	19	NA	1
38d2f047-3036-458b-95fb-fc86f5b7924e	84	118	1	48f2f69f-4958-4180-8f31-e3eb3dedc28d	22	NA	1

1.7 Check the food consumption score

The check_fcs() function verifies whether all the food consumption components have identical values or not. It will flag the UUIDs where all the values are the same.

cleaningtools::check_fcs(
  dataset = cleaningtools::cleaningtools_food_consumption_df,
  uuid_column = "X_uuid",
  cereals_column = "cereals_grains_roots_tubers",
  pulses_column = "beans_legumes_pulses_nuts",
  dairy_column = "milk_dairy_products",
  meat_column = "meat_fish_eggs",
  vegetables_column = "vegetables",
  fruits_column = "fruite",
  oil_column = "oil_fat_butter",
  sugar_column = "sugar_sugary_food"
) |>head() |> knitr::kable()

## Warning in cleaningtools::check_fcs(dataset =
## cleaningtools::cleaningtools_food_consumption_df, : Potential issue ! There are
## 105 observations where all the variables of food consumption score are the
## same.Check result.

X_uuid	cereals_grains_roots_tubers	beans_legumes_pulses_nuts	milk_dairy_products	meat_fish_eggs	vegetables	fruite	oil_fat_butter	sugar_sugary_food	issue
e7da37c0-dd23-4d38-8cac-2e8e8a243b57	0	0	0	0	0	0	0	0	All the vlaues of of food consumption variables are the same
f6b056c8-bda7-43d2-a723-582dc8001265	7	7	7	7	7	7	7	7	All the vlaues of of food consumption variables are the same
c8a00acf-1626-42f8-8c8f-592aa3c9e688	1	1	1	1	1	1	1	1	All the vlaues of of food consumption variables are the same
911204db-f11d-451b-9cd4-44cb76635611	3	3	3	3	3	3	3	3	All the vlaues of of food consumption variables are the same
3f6ce5f7-5afe-4bd9-91ab-03c1b81a70ed	7	7	7	7	7	7	7	7	All the vlaues of of food consumption variables are the same
cc002ebb-a3e5-4b6b-81ae-cbd9c9b047e8	1	1	1	1	1	1	1	1	All the vlaues of of food consumption variables are the same

1.8 Check others values

The check_others() function generate a log for other follow up questions

output <- cleaningtools::check_others(
  dataset = cleaningtools::cleaningtools_clean_data,
  uuid_column = "X_uuid",
  columns_to_check = names(cleaningtools::cleaningtools_clean_data |>
    dplyr::select(ends_with("_other")) |>
    dplyr::select(-contains(".")))
)

output$other_log |>head() |> knitr::kable()

uuid	question	old_value	issue
630d0067-d84a-4fd0-8c36-029e87913c40	primary_livelihood_other	عسكري ضمن صفوف الداخليه	recode other
ac98cb2b-80a9-4ee9-ba67-6fd38b0247d8	problems_water_main_complaint_other	عدم وصول شبكة الماء لهذا للحي	recode other
c18fc8f9-c8b7-4ba3-8e61-4460c3604a9d	problems_water_main_complaint_other	عدم وصول شبكة الماء لهذا الحي	recode other
816f41ec-603e-4d5d-a284-8bdb9cbf913d	problems_water_main_complaint_other	عدم وجود شبكة ماء لهذا الحي	recode other
0be815d6-1f31-4d4d-8775-6043df8b9f03	problems_water_main_complaint_other	عدم وجود شبكة ماء	recode other
b4588be7-aa42-47de-9d00-e85ec968c1fa	problems_water_main_complaint_other	عدم وجود شبكة ماء لهذه المنطقة	recode other

1.9 Check percentage of missing values.

1.9.1 Add the percentage missing

The add_percentage_missing() adds the percentage of missing values per row.

data_example <- data.frame(
  uuid = letters[1:3],
  col_1 = c(1, NA, 3),
  col_2 = c(NA, NA, "expenditures"),
  col_3 = c("with need", NA, "with need"),
  col_4 = c("food health school", NA, "food"),
  col_4.food = c(1, NA, 1),
  col_4.health = c(1, NA, 0),
  col_4.school = c(1, NA, 0)
)

data_example <- data_example %>% cleaningtools::add_percentage_missing()
data_example |>head() |> knitr::kable()

uuid	col_1	col_2	col_3	col_4	col_4.food	col_4.health	col_4.school	percentage_missing
a	1	NA	with need	food health school	1	1	1	0.125
b	NA	NA	NA	NA	NA	NA	NA	0.875
c	3	expenditures	with need	food	1	0	0	0.000

1.9.2 add_percentage_missing()

The add_percentage_missing() function will flag if a survey for its missing values. The missing values column can be created with add_percentage_missing and the values are flagged with check_outliers.

data_example %>%
  cleaningtools::check_percentage_missing() |>
 head() |> knitr::kable()

## [1] "checking_percentage_missing"

uuid	col_1	col_2	col_3	col_4	col_4.food	col_4.health	col_4.school	percentage_missing
a	1	NA	with need	food health school	1	1	1	0.125
b	NA	NA	NA	NA	NA	NA	NA	0.875
c	3	expenditures	with need	food	1	0	0	0.000

uuid	issue	question	old_value

2. Creation of the cleanng log

2.1 create_combined_log()

The function create_combined_log() takes the cleaning logs as input and returns a list with two elements: the dataset and the combined cleaning log.

list_log <- cleaningtools::cleaningtools_raw_data |>
  check_pii(uuid_column = "X_uuid") |>
  check_duplicate(uuid_column = "X_uuid") |>
  check_value(uuid_column = "X_uuid") |>
  create_combined_log()

## List of element to combine- checked_dataset, potential_PII, duplicate_log, flaged_value

list_log$cleaning_log |> head(6)

## # A tibble: 6 × 7
##   uuid              question issue old_value change_type new_value check_binding
##   <chr>             <chr>    <chr> <chr>     <chr>       <chr>     <chr>        
## 1 all               neighbo… Pote… NA        NA          NA        neighbourhoo…
## 2 all               water_s… Pote… NA        NA          NA        water_supply…
## 3 all               water_s… Pote… NA        NA          NA        water_supply…
## 4 all               water_s… Pote… NA        NA          NA        water_supply…
## 5 all               consent… Pote… NA        NA          NA        consent_tele…
## 6 ac26e24d-12be-47… X_index  Poss… 88        NA          NA        X_index ~/~ …

2.2 `add_info_to_cleaning_log()`

The function add_info_to_cleaning_log() is designed to add information from the dataset into the cleaning log.

add_with_info <- list_log |> add_info_to_cleaning_log(dataset_uuid_column = "X_uuid")

add_with_info$cleaning_log |>head() |> knitr::kable()

uuid	question	issue	old_value	change_type	new_value	check_binding	enumerator_num	date_assessment
ac26e24d-12be-4729-bae7-21060ee00a28	X_index	Possible value to be changed to NA	88	NA	NA	X_index _/ ac26e24d-12be-4729-bae7-21060ee00a28	13	2021-07-06
all	neighbourhood	Potential PII	NA	NA	NA	neighbourhood _/ all	NA	NA
all	water_supply_rest_neighbourhood	Potential PII	NA	NA	NA	water_supply_rest_neighbourhood _/ all	NA	NA
all	water_supply_other_neighbourhoods	Potential PII	NA	NA	NA	water_supply_other_neighbourhoods _/ all	NA	NA
all	water_supply_other_neighbourhoods_why	Potential PII	NA	NA	NA	water_supply_other_neighbourhoods_why _/ all	NA	NA
all	consent_telephone_number	Potential PII	NA	NA	NA	consent_telephone_number _/ all	NA	NA

2.3 `create_xlsx_cleaning_log()`

The function add_info_to_cleaning_log() is designed to add information from the dataset into the cleaning log.

add_with_info |>
  create_xlsx_cleaning_log(
    kobo_survey = cleaningtools_survey,
    kobo_choices = cleaningtools_choices,
    use_dropdown = TRUE,
    output_path = "mycleaninglog.xlsx"
  )

3. Clean the data using the cleaning log

We are creating a dataset and cleaning log for the example

test_data <- data.frame(
  uuid = paste0("uuid", 1:4),
  age = c(180, 23, 45, 67),
  gender = c("male", "female", "male", "female"),
  pop_group = c("idp", "refugee", "host", "idp"),
  strata = c("a", "b", "c", "d")
)
test_data

##    uuid age gender pop_group strata
## 1 uuid1 180   male       idp      a
## 2 uuid2  23 female   refugee      b
## 3 uuid3  45   male      host      c
## 4 uuid4  67 female       idp      d

cleaning_log_test <- data.frame(
  uuid = paste0("uuid", 1:4),
  question = c("age", "gender", "pop_group", "strata"),
  change_type = c("blank_response", "no_change", "Delete", "change_res"),
  new_value = c(NA_character_, NA_character_, NA_character_, "st-a")
)

cleaning_log_test

##    uuid  question    change_type new_value
## 1 uuid1       age blank_response      <NA>
## 2 uuid2    gender      no_change      <NA>
## 3 uuid3 pop_group         Delete      <NA>
## 4 uuid4    strata     change_res      st-a

3.1 Check the cleaning log

After obtaining both the cleaning log and dataset, it is considered good practice to utilize the review_cleaning_log() function to ensure the consistency between the cleaning log and the dataset. It is highly recommended to perform this check on a daily basis, enabling you to promptly identify any issues right from the outset.

cleaningtools::review_cleaning_log(
  raw_dataset = test_data,
  raw_data_uuid_column = "uuid",
  cleaning_log = cleaning_log_test,
  cleaning_log_change_type_column = "change_type",
  change_response_value = "change_res",
  cleaning_log_question_column = "question",
  cleaning_log_uuid_column = "uuid",
  cleaning_log_new_value_column = "new_value"
)

## [1] "no issues in cleaning log found"

3.2 Create the clean data from the raw data and cleaning log

Once you have a perfect cleaning log and the raw dataset, you can create clean data by applyingcreate_clean_data() function.

cleaningtools::create_clean_data(
  raw_dataset = test_data,
  raw_data_uuid_column = "uuid",
  cleaning_log = cleaning_log_test,
  cleaning_log_change_type_column = "change_type",
  change_response_value = "change_res",
  NA_response_value = "blank_response",
  no_change_value = "no_change",
  remove_survey_value = "Delete",
  cleaning_log_question_column = "question",
  cleaning_log_uuid_column = "uuid",
  cleaning_log_new_value_column = "new_value"
)

## [1] "age"
## [1] "strata"

##    uuid age gender pop_group strata
## 1 uuid1  NA   male       idp      a
## 2 uuid2  23 female   refugee      b
## 3 uuid4  67 female       idp   st-a

3.3 Recreate parent column for choice multiple

recreate_parent_column() recreates the concerted columns for select multiple questions

test_data <- dplyr::tibble(
  uuid = paste0("uuid_", 1:6),
  gender = rep(c("male", "female"), 3),
  reason = c(
    "xx,yy", "xx,zy",
    "zy", "xx,xz,zy",
    NA_character_, "xz"
  ),
  reason.x.x. = c(0, 1, 0, 1, 0, 0),
  reason.yy = c(1, 0, 0, 0, 1, 0),
  reason.x.z = c(0, 0, 0, 1, 0, 1),
  reason.zy = c(0, 1, 1, 1, 0, 0),
  reason_zy = c(NA_character_, "A", "B", "C", NA_character_, NA_character_)
)

cleaningtools::recreate_parent_column(dataset = test_data, uuid_column = "uuid", sm_separator = ".") |>head() |> knitr::kable()

## Warning in cleaningtools::recreate_parent_column(dataset = test_data,
## uuid_column = "uuid", : Column(s) names are renamed as multiple separators are
## found in dataset column names. Please see the above table with the new name.

## # A tibble: 2 × 2
##   old_name    new_name   
##   <chr>       <chr>      
## 1 reason.x.x. reason.x_x_
## 2 reason.x.z  reason.x_z

## gender

## reason.yy

## reason.zy

## reason_zy

## gender

## reason.yy

## reason.zy

## reason_zy

## gender

## reason

## reason.yy

## reason.zy

## reason_zy

uuid	gender	reason	reason.x_x_	reason.yy	reason.x_z	reason.zy	reason_zy
uuid_1	male	yy	0	1	0	0	NA
uuid_2	female	x_x_ zy	1	0	0	1	A
uuid_3	male	zy	0	0	0	1	B
uuid_4	female	x_x_ x_z zy	1	0	1	1	C
uuid_5	male	yy	0	1	0	0	NA
uuid_6	female	x_z	0	0	1	0	NA

uuid	question	change_type	new_value	old_value	comment
all	reason.x.x.	variable_removed	NA	NA	variable removed from the clean dataset
all	reason.x.z	variable_removed	NA	NA	variable removed from the clean dataset
all	reason.x_x_	variable_added	NA	NA	variable added to the clean dataset
all	reason.x_z	variable_added	NA	NA	variable added to the clean dataset
uuid_1	reason	change_response	yy	xx,yy	Parent column changed to match children columns
uuid_2	reason	change_response	x_x_ zy	xx,zy	Parent column changed to match children columns
uuid_4	reason	change_response	x_x_ x_z zy	xx,xz,zy	Parent column changed to match children columns
uuid_6	reason	change_response	x_z	xz	Parent column changed to match children columns
uuid_5	reason	change_response	yy	NA	NA changed to value

4. Review of the cleaning

4.1 Review cleaning log with clean data and raw data

review_cleaning function takes raw data, clean data and cleaning log as inputs, and it first creates the cleaning log by comparing raw data and clean data, then compares it with the user-provided cleaning log. Finally, flagged the discrepancies between them (if any).

# compared_df <- review_cleaning(
#   raw_dataset = cleaningtools::cleaningtools_raw_data,
#   raw_dataset_uuid_column = "X_uuid",
#   clean_dataset = cleaningtools::cleaningtools_clean_data,
#   clean_dataset_uuid_column = "X_uuid",
#   cleaning_log = cleaning_log2,
#   cleaning_log_uuid_column = "X_uuid",
#   cleaning_log_question_column = "questions",
#   cleaning_log_new_value_column = "new_value",
#   cleaning_log_old_value_column = "old_value",
#   deletion_log = deletion_log,
#   deletion_log_uuid_column = "X_uuid",
#   check_for_deletion_log = T
# )
# 
# compared_df |>head() |> knitr::kable()

4.2 Example `review_others()`

The review_others() function reviews discrepancy between kobo relevancies and the dataset

review_others(
  dataset = cleaningtools::cleaningtools_clean_data,
  uuid_column = "X_uuid", kobo_survey = cleaningtools::cleaningtools_survey
) |>head() |> knitr::kable()

uuid	question	old_value	issue	check_id	check_binding
f58a7fda-27e8-4003-90b3-479bebbb99ab	consent_telephone_number	yes	consent_telephone_number is selected but telephone_number is not found in the dataset	id- 27	id- 27 _/ f58a7fda-27e8-4003-90b3-479bebbb99ab
956b5ed0-5a62-41b7-aec3-af93fbc5b494	consent_telephone_number	yes	consent_telephone_number is selected but telephone_number is not found in the dataset	id- 27	id- 27 _/ 956b5ed0-5a62-41b7-aec3-af93fbc5b494
3413afd2-8f05-4a6e-8ec7-5d64dc8fea23	consent_telephone_number	yes	consent_telephone_number is selected but telephone_number is not found in the dataset	id- 27	id- 27 _/ 3413afd2-8f05-4a6e-8ec7-5d64dc8fea23
630d0067-d84a-4fd0-8c36-029e87913c40	consent_telephone_number	yes	consent_telephone_number is selected but telephone_number is not found in the dataset	id- 27	id- 27 _/ 630d0067-d84a-4fd0-8c36-029e87913c40
2cd180e8-7f2b-460b-82b5-fb9d163f8e7b	consent_telephone_number	yes	consent_telephone_number is selected but telephone_number is not found in the dataset	id- 27	id- 27 _/ 2cd180e8-7f2b-460b-82b5-fb9d163f8e7b
929d60f8-ed9c-4c42-9ee1-cb4a67e4ba27	consent_telephone_number	yes	consent_telephone_number is selected but telephone_number is not found in the dataset	id- 27	id- 27 _/ 929d60f8-ed9c-4c42-9ee1-cb4a67e4ba27

4.3 Example `review_sample_frame_with_dataset()`

review_sample_frame_with_dataset() compares the sample frame with dataset and provide the overview of completed and remaining surveys.

review_output <- cleaningtools::review_sample_frame_with_dataset(
  sample_frame = cleaningtools::cleaningtools_sample_frame,
  sampling_frame_strata_column = "Neighbourhood",
  sampling_frame_target_survey_column = "Total.no.of.HH",
  clean_dataset = cleaningtools::cleaningtools_clean_data,
  clean_dataset_strata_column = "neighbourhood",
  consent_column = "consent_remote",
  consent_yes_value = "yes"
)
review_output |>head() |> knitr::kable()

Managed.by	Governorate	Neighbourhood	Total.no.of.HH	Collected
Talafar	Ninewa	A1	22	22
Talafar	Ninewa	A2	19	19
Talafar	Ninewa	A3	5	5
Talafar	Ninewa	A4	6	6
Talafar	Ninewa	B1	12	12
Talafar	Ninewa	B2	15	15