# Load the four data files and only keep the useful columns
prolific_export = read_csv(paste0(datapath, "prolific_export.csv")) %>%
  dplyr::select(participant_id,
         entered_code)

results = read_csv(paste0(datapath, "results.csv")) %>%
  dplyr::select(participant_id,
         starts_with("timestamp"))

excluded = read_csv(paste0(datapath, "excluded.csv")) %>%
  dplyr::select(participant_id,
         failed_attention_check,
         reloaded) %>%
  mutate(participant_id = as.character(participant_id)) # read_csv() guesses a non-character type when the column is all NAs, which would break the joins below

agreed = read_csv(paste0(datapath, "all_who_agreed.csv")) %>%
  dplyr::select(participant_id,
         condition,
         design,
         identity)
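
# Optional sanity check (not part of the original pipeline): read_csv() guesses
# column types per file, so participant_id may be parsed inconsistently across
# the four exports; inspecting the classes before joining catches that early.
sapply(list(prolific = prolific_export, results = results,
            excluded = excluded, agreed = agreed),
       function(d) class(d$participant_id))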

# Join the four datasets and clean up a bit
data_all = agreed %>%
  left_join(excluded, by = "participant_id") %>%
  left_join(prolific_export, by = "participant_id") %>%
  full_join(results, by = "participant_id") %>%
  dplyr::filter(!participant_id %in% exclude_participant_ids) %>%
  mutate(failed_attention_check = coalesce(failed_attention_check, 0), # replace NAs with zeros
         reloaded = coalesce(reloaded, 0)) %>%
  mutate(
    identity = case_when(
      identity == "NonAnonymous" ~ "Named",
      identity == "Anonymous" ~ identity
    ),
    design = case_when(
      design == "Anthropographic" ~ "Rich",
      design == "NonAnthropographic" ~ "Poor"
    ),
    condition = paste(identity, design, sep = "_")
  )
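
# Optional check (not part of the original script): confirm that the recoded
# identity and design labels combine into the four expected conditions.
data_all %>% count(identity, design, condition)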

# Clean up possibly serious anomalies in the data and issue warnings if necessary
data_all_original = data_all
data_all = data_all %>%
  drop_na(participant_id, condition) %>%
  distinct(participant_id, .keep_all= TRUE)
if (nrow(data_all) != nrow(data_all_original)) {
  cat("WARNING -- Found duplicate participant IDs or rows with a missing participant ID or condition. This data was discarded:\n")
  print(anti_join(data_all_original, data_all))
}

# Create another dataset with completed submissions only
data = data_all %>% dplyr::filter(entered_code == completion_code)
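
# Optional check (not part of the original script): completed submissions per
# condition after filtering on the completion code.
data %>% count(condition)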

# Page order, used below to order the factor levels
sequence = c("important", "consent", "description", "vis", "donation", "affect", "attention")

# Transform the data to analyze the timestamps
timestamp_data = data %>%
  mutate(
    ts_important = timestamp_1 - timestamp_0,
    ts_consent = timestamp_2 - timestamp_1,
    ts_description = timestamp_3 - timestamp_2,
    ts_vis = timestamp_4 - timestamp_3,
    ts_donation = timestamp_5 - timestamp_4,
    ts_affect = timestamp_6 - timestamp_5,
    ts_attention = timestamp_7 - timestamp_6
  ) %>%
  mutate(across(starts_with("ts_"), ~ .x / 1000)) %>% # convert from milliseconds to seconds
  select(starts_with("ts"), condition) %>%
  pivot_longer(-condition, names_to = "page", values_to = "time") %>%
  mutate(page = factor(str_remove(page, "ts_"), levels = sequence, ordered = T))
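
# Optional sanity check (not part of the original analysis): negative page times
# would indicate out-of-order timestamps and deserve a closer look.
timestamp_data %>% dplyr::filter(time < 0)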
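
# ggplot2 accepts a list of components, so the shared labels, color palette and
# x-axis theme below can be added to each plot with "+ basic_elements_layer".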
basic_elements_layer = list(
  labs(x = "Experiment Pages",
       y = "Time (s)",
       color = "Condition"),
  scale_color_brewer(palette = "Set2"),
  theme(axis.text.x = element_text(angle = 30, color = "gray40"))
)

0.1 Overview of time spent on every page

timestamp_data %>%
  group_by(page) %>%
  mutate(median_time = median(time, na.rm = T)) %>%
  ggplot(aes(x = page,
             y = time)) +
  geom_jitter(alpha = .2, width = .1, color = "orangered") +
  geom_point(aes(y = median_time),
             color = "gold",
             size = 2) +
  basic_elements_layer

0.2 Time distribution between conditions

timestamp_data %>%
  ggplot(aes(
    x = page,
    y = time,
    color = condition
  )) +
  geom_boxplot(outlier.color = NA) + # hide individual outlier points
  # note: scale limits drop times above 175 s before the box stats are computed;
  # coord_cartesian(ylim = c(0, 175)) would merely zoom instead
  scale_y_continuous(limits = c(0, 175)) +
  basic_elements_layer +
  theme(legend.position = "top")

0.3 Median time on each page by condition

timestamp_data %>%
  group_by(condition, page) %>%
  summarise(median_time = median(time, na.rm = T), .groups = "drop") %>%
  ggplot(aes(
    x = page,
    y = median_time,
    color = condition,
    group = condition
  )) +
  geom_point() +
  geom_line() +
  basic_elements_layer +
  ylab("Median Time (s)") +
  theme(legend.position = "top")