# Read the four raw CSV files, retaining only the columns used further down
prolific_export <- paste0(datapath, "prolific_export.csv") %>%
  read_csv() %>%
  dplyr::select(participant_id, entered_code)

results <- paste0(datapath, "results.csv") %>%
  read_csv() %>%
  dplyr::select(participant_id, starts_with("timestamp"))

excluded <- paste0(datapath, "excluded.csv") %>%
  read_csv() %>%
  dplyr::select(participant_id, failed_attention_check, reloaded) %>%
  # Coerce the ID to character; this is necessary when all values are NA
  mutate(participant_id = as.character(participant_id))

agreed <- paste0(datapath, "all_who_agreed.csv") %>%
  read_csv() %>%
  dplyr::select(participant_id, condition, design, identity)
# Join the four datasets and clean up a bit.
# `agreed` is the base table; exclusion flags and the Prolific export are
# attached with left joins, while `results` is attached with a full join, so
# participants that only appear in `results` are kept (with NA in the columns
# coming from the other tables).
data_all <- agreed %>%
  left_join(excluded, by = "participant_id") %>%
  left_join(prolific_export, by = "participant_id") %>%
  full_join(results, by = "participant_id") %>%
  # Drop the participants listed in `exclude_participant_ids`
  subset(!participant_id %in% exclude_participant_ids) %>%
  # Participants absent from `excluded` get NA flags; replace NAs with zeros
  mutate(
    failed_attention_check = coalesce(failed_attention_check, 0),
    reloaded = coalesce(reloaded, 0)
  ) %>%
  mutate(
    # Recode the factor labels; any value not listed becomes NA
    identity = case_when(
      identity == "NonAnonymous" ~ "Named",
      identity == "Anonymous" ~ identity
    ),
    design = case_when(
      design == "Anthropographic" ~ "Rich",
      design == "NonAnthropographic" ~ "Poor"
    ),
    # paste() would turn missing values into the literal string "NA", giving
    # conditions such as "NA_NA" that drop_na() cannot detect. Keep the
    # condition as a real NA whenever either component is missing.
    condition = if_else(
      is.na(identity) | is.na(design),
      NA_character_,
      paste(identity, design, sep = "_")
    )
  )
# Clean up possibly serious anomalies in the data and issue warnings if necessary:
# rows with a missing participant ID or condition are removed, and only the
# first row of each participant ID is kept.
data_all_original <- data_all
# Tag every row with its position so that discarded rows can be identified
# exactly, even when a duplicate is identical to the kept row in every column
# (an anti-join on all columns would silently miss those).
data_all_tagged <- data_all_original %>%
  mutate(.row_id = row_number())
data_all_kept <- data_all_tagged %>%
  drop_na(participant_id, condition) %>%
  distinct(participant_id, .keep_all = TRUE)
data_all <- data_all_kept %>%
  dplyr::select(-.row_id)
if (nrow(data_all) != nrow(data_all_original)) {
  cat("WARNING -- Found duplicate participant IDs or data with missing participant ID or missing condition. This data was discarded:\n")
  # Print explicitly: relying on auto-printing of the `if` value shows nothing
  # when the script is run through source()
  print(
    data_all_tagged %>%
      dplyr::filter(!.row_id %in% data_all_kept$.row_id) %>%
      dplyr::select(-.row_id)
  )
}
# Remove the temporary helper tables
rm(data_all_tagged, data_all_kept)
# Completed submissions only: rows whose entered code equals the completion code
data <- dplyr::filter(data_all, entered_code == completion_code)
# Page identifiers, listed in the order used for the `page` factor levels
sequence <- c(
  "important",
  "consent",
  "description",
  "vis",
  "donation",
  "affect",
  "attention"
)
# Transforms data to analyze the timestamps.
# Builds a long table with one row per (participant, page), holding the
# condition, the page name and the time spent on that page.
timestamp_data <- data %>%
  mutate(
    # Time on a page = difference between consecutive timestamps, divided by
    # 1000 (presumably milliseconds -> seconds, matching the "Time (s)" axis
    # label used in the plots -- TODO confirm the timestamp unit)
    ts_important = (timestamp_1 - timestamp_0) / 1000,
    ts_consent = (timestamp_2 - timestamp_1) / 1000,
    ts_description = (timestamp_3 - timestamp_2) / 1000,
    ts_vis = (timestamp_4 - timestamp_3) / 1000,
    ts_donation = (timestamp_5 - timestamp_4) / 1000,
    ts_affect = (timestamp_6 - timestamp_5) / 1000,
    ts_attention = (timestamp_7 - timestamp_6) / 1000
  ) %>%
  # Qualified like every other select() call in this file, so a masking
  # package cannot hijack it
  dplyr::select(starts_with("ts_"), condition) %>%
  # Wide -> long: one row per duration column, keeping `condition` as the id
  melt(id.vars = c("condition")) %>%
  rename(page = variable,
         time = value) %>%
  # Strip the "ts_" prefix and order the pages as listed in `sequence`
  mutate(page = factor(str_remove(page, "^ts_"), levels = sequence, ordered = TRUE))
# Console output that appears to have been captured from the anomaly check above:
## WARNING -- Found duplicate participant IDs or data with missing participant ID or missing condition. This data was discarded:
# Shared ggplot components added to each plot: axis and legend labels,
# the "Set2" color palette, and tilted gray x-axis tick labels
basic_elements_layer <- list(
  labs(x = "Experiment Pages", y = "Time (s)", color = "Condition"),
  scale_color_brewer(palette = "Set2"),
  theme(
    axis.text.x = element_text(angle = 30, color = "gray40")
  )
)
# Overview of time spent on every page
# Jittered individual times for every page, with the per-page median overlaid
timestamp_data %>%
  group_by(page) %>%
  # Attach the page median to every row so it can be drawn as its own layer
  mutate(median_time = median(time, na.rm = TRUE)) %>%
  # Grouping is only needed for the median; do not carry it into the plot data
  ungroup() %>%
  ggplot(aes(x = page, y = time)) +
  geom_jitter(alpha = .2, width = .1, color = "orangered") +
  geom_point(aes(y = median_time), color = "gold", size = 2) +
  basic_elements_layer
# Time distribution between conditions
# Boxplots of the time spent on each page, split by condition
timestamp_data %>%
  ggplot(aes(x = page, y = time, color = condition)) +
  # Outlier points are hidden
  geom_boxplot(outlier.color = NA) +
  # Zoom the y axis to 0-175 without discarding data. Setting `limits` on the
  # scale would remove every observation outside that range *before* the
  # boxplot statistics are computed, distorting medians and quartiles.
  coord_cartesian(ylim = c(0, 175)) +
  basic_elements_layer +
  theme(legend.position = "top")
# Median time on each page by condition
# Line plot of the median time per page, one line per condition
timestamp_data %>%
  group_by(condition, page) %>%
  summarise(median_time = median(time, na.rm = TRUE)) %>%
  # summarise() leaves the result grouped by `condition`; drop that grouping
  # explicitly before plotting
  ungroup() %>%
  ggplot(aes(
    x = page,
    y = median_time,
    color = condition,
    group = condition
  )) +
  geom_point() +
  geom_line() +
  basic_elements_layer +
  # Override the generic "Time (s)" label set by basic_elements_layer
  ylab("Median Time (s)") +
  theme(legend.position = "top")