library(forcats) # Factor reordering
library(ggplot2) # Plots
library(data.table)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(geodist)
library(stopdetection)
library(topdowntimeratio)
## 
## Attaching package: 'topdowntimeratio'
## The following object is masked from 'package:stopdetection':
## 
##     radiusOfGyrationDT
source("utils.R")
source("dtwbi_functions.R")
## Loading required package: proxy
## 
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
## Loaded dtw v1.22-3. See ?dtw for help, citation("dtw") for use in publication.
## 
## Attaching package: 'hms'
## The following object is masked from 'package:lubridate':
## 
##     hms
knitr::opts_chunk$set(cache = FALSE, warning = FALSE, 
                      message = FALSE, cache.lazy = FALSE)

Data cleaning

Loading

Raw location data collected from respondents was used. All entries with an accuracy of 200 or higher were removed.

# library(stopdetection)

source("~/cbs_share/projecten/BPM/301707WaarnVnw_SEC1/Werk/Tabi Verplaatsingen App/R code Danielle McCool/New code suite/file_reading_functions.R")
source("~/cbs_share/projecten/BPM/301707WaarnVnw_SEC1/Werk/Tabi Verplaatsingen App/R code Danielle McCool/New code suite/file_merging_functions.R")
source("~/cbs_share/projecten/BPM/301707WaarnVnw_SEC1/Werk/Tabi Verplaatsingen App/R code Danielle McCool/New code suite/file_cleaning_functions.R")

# Set up reference data ---------------------------------------------------

data <- readInData(getNewFilenames(as.if.day = "2018-12-15"))

# Remove unnecessary columns
data[, created_at := NULL]
data[, id := NULL]
data[, distance_between_previous_position := NULL]

# Remove duplicates
data <- copy(unique(data, by = c("device_id", "latitude", "longitude", "timestamp")))

# Remove data from cell towers
data <- data[accuracy < 200]

# We have to do the ordering a number of times
setorder(data, device_id, timestamp)


# These are the standard data loading functions I've used for two years
devices   <- readInData(getNewFilenames(type = "device", as.if.day = "2018-12-16"))
users     <- readInData(getNewFilenames(type = "user", as.if.day = "2018-12-16"))
data      <- mergeDataDevices(data, devices)
data      <- mergeDataUsers(data, users)
data      <- keepParticipants(data)
data <- data[!is.na(device_id)]
data <- data[!is.na(timestamp)]

data[, c("altitude", "speed", "desired_accuracy", "heading") := list(NULL)]
## Timestamp duplications
data[!duplicated(timestamp), dupid := 0]
data[duplicated(timestamp), dupid := 1:.N, .(device_id, timestamp)]
data[, timestamp2 := timestamp + microseconds(dupid)]
data[, timestamp := timestamp2]
setorder(data, device_id, timestamp)

Creating days

To facilitate comparison between methods, movement behavior was subdivided into 24-hour periods starting at 4 a.m., so that late-night travel behavior occurring after midnight is captured with the preceding day.

# Correct timezone display
data[, timestamp := with_tz(timestamp, tzone = "Europe/Amsterdam")]
data[, timestamp_date := floor_date(timestamp, "day")]

# Timestamps after 04:00 should be the timestamp date
# Timestamps between midnight and 03:59:59.999 should be the timestamp date minus one
data[,  adj_date := floor_date(timestamp - hours(4), "day")]
data[, day_of_week := wday(adj_date, label = TRUE)]
data[, day_of_week := ordered(day_of_week)]
data[, person_day := paste(device_id, format(adj_date, ("%d-%m")), sep = "_")]

Establishing missingness

We select a criterion for how much time must elapse before we consider someone to have missing data; in this case, we opt for a maximum gap of 5 minutes. Because people often travel in a way that is similar from week to week, but may differ moderately between the days of the week, we are interested in recording movement for each day of the week for each person. The figure below shows, for each participant, the number of recorded hours on the best-covered instance of each weekday.

setorder(data, device_id, timestamp)
data[, ttn := shift(timestamp,n = -1) - timestamp, person_day]
data[, gap := TRUE]
data[ttn < minutes(5), gap := FALSE]

# Take the sum of all intervals shorter than the minimum gap length per day of
# week
coverage <-
  copy(data[gap == FALSE,
            .(hours = time_length(sum(ttn), "hours")),
            .(device_id, person_day, day_of_week)])

# For each day of the week for a person, find the one with the best coverage
cvplot <- 
  coverage[,
           .(max_hrs = round(max(hours), 2)),
           .(device_id, day_of_week)][order(device_id, day_of_week)]

# Find the week total
cvplot[, sumhrs := sum(max_hrs), device_id]

# Order the device_ids by the week total
cvplot[, device_id_ordered := fct_reorder(factor(device_id), sumhrs)]

# Tile map of participant weeks
ggplot(cvplot, aes(y = day_of_week, x = device_id_ordered, fill = max_hrs)) +
  geom_tile() +
  viridis::scale_fill_viridis(option = "A") +
  labs(x = "Participant", y = "", fill = "Hours")+
  theme(axis.ticks.x = element_blank(),
        axis.text.x = element_blank(),
        legend.position = "bottom",
        legend.direction = "horizontal",
        legend.margin = margin(),
        legend.key.width = unit(3, "cm"),
        legend.key.height = unit(.5, "cm")) +
  coord_fixed(ratio = 20)

A participant with seven days of complete data would have a total of 168 hours. In practice, participants had a mean of 59.4 hours and a median of 54.4 hours.
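
These summary figures can be reproduced from the per-participant weekly totals in cvplot; a minimal sketch (rounding may differ slightly from the text):

week_totals <- cvplot[, .SD[1], device_id]   # one row per participant
week_totals[, .(mean_hours = mean(sumhrs), median_hours = median(sumhrs))]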

cvplot[, .SD[1], device_id] %>% 
  ggplot(aes(x = sumhrs)) +
  geom_histogram(binwidth = 4) +
  scale_x_continuous("Week hour sum") +
  theme_minimal()

Results using other methods

Listwise deletion

The most common method of addressing missingness is listwise deletion, in which only data with no gaps are used. When the missingness mechanism is missing completely at random (MCAR; the probability of being missing is the same for all cases), this produces unbiased results, although it is less efficient, with larger standard errors than if all available data were used. Let us say that at least 95% of the total week time must be covered, giving a cutoff of approximately 160 hours. This would leave us with 12 persons out of our 604 respondents. We may then perform our analyses on the basis of only these data.
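
As a quick check of the cutoff arithmetic and the resulting sample size (a sketch using the objects defined above):

0.95 * 7 * 24                              # 159.6 hours; rounded to the 160-hour cutoff used below
cvplot[sumhrs > 160, uniqueN(device_id)]   # number of respondents with a near-complete week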

complete_week_ids <- cvplot[sumhrs > 160, unique(device_id)]
complete_week_pd <- coverage[device_id %in% complete_week_ids, person_day[hours == max(hours)], .(device_id, day_of_week)]$V1
complete_week_data <- copy(data[person_day %in% complete_week_pd])


# For each complete person-day: detect stops, merge tracks, segment movement with TDTR,
# and compute day-level travel summaries
for (i in seq_along(complete_week_pd)) {
  this_person_day <- complete_week_pd[i]
  res <- stopFinder(copy(complete_week_data[person_day == this_person_day]), thetaD = 200, thetaT = minutes(5))
  mergingCycle(res, thetaD = 200, small_track_action = "exclude", max_time = minutes(5), max_locs = Inf, max_dist = 200)
  events <- returnStateEvents(res)
  
  if (any(events$state == "moving")) {
    segs <- getSegments(tdtr(res[state == "moving"],
                             col_names = list(entity_id_col = "device_id", timestamp_col = "timestamp", latitude_col
                                              = "latitude", longitude_col = "longitude"),
                             group_col = "move_id", max_error = 30), group = TRUE)
    
    
    events[, total_travel_dist := sum(segs$segdist)]
    events[, n_trips := max(move_id, na.rm = TRUE)]
    events[, avg_dist_per_trip := total_travel_dist / n_trips]
    events[state == "moving", total_trip_time := sum(time_length(as.duration(end_time - begin_time), "minutes"))]
    events[, total_trip_time := na.exclude(total_trip_time)[1]]
    events[state == "moving", avg_trip_time := mean(time_length(as.duration(end_time - begin_time), "minutes"))]
    events[, avg_trip_time := na.exclude(avg_trip_time)[1]]
    
  } else {
    set(events,
        j = c("total_travel_dist", "avg_dist_per_trip", "total_trip_time", "avg_trip_time", "n_trips"),
        value = list(0))
  }
  if (any(events$state == "stopped")) {
    events[, n_stops := max(stop_id, na.rm = TRUE)]
    
  } else {
    set(events,
        j = c("n_stops"),
        value = list(0))
  }
  
  complete_week_data[person_day == this_person_day,
                     c("total_travel_dist", "avg_dist_per_trip", "total_trip_time", "avg_trip_time", "n_trips", "n_stops", "rog") :=
                       c(unique(events[, .(total_travel_dist, avg_dist_per_trip, total_trip_time, avg_trip_time, n_trips, n_stops)]), radiusOfGyrationDT(latitude, longitude, timestamp))]
}

tabdat <-
  unique(complete_week_data[, .(
    day_of_week,
    n_trips,
    n_stops,
    total_travel_dist,
    avg_dist_per_trip,
    total_trip_time,
    avg_trip_time,
    rog
  ), person_day])

# tabdat[, day_of_week := ordered(day_of_week)]

Means

| Day | Total Distance | Mean Distance/Trip | Total Travel Time | Mean Time/Trip | Trips | Stops | Radius of Gyration |
|-----|----------------|--------------------|-------------------|----------------|-------|-------|--------------------|
| Sun | 23.1 km | 7.4 km | 28 min | 10 min | 1.5 | 2.6 | 2.7 km |
| Mon | 21.6 km | 5.0 km | 34 min | 9 min | 2.9 | 5.1 | 3.6 km |
| Tue | 33.0 km | 11.3 km | 42 min | 13 min | 2.9 | 5.2 | 5.5 km |
| Wed | 39.8 km | 11.5 km | 46 min | 13 min | 3.1 | 4.6 | 6.0 km |
| Thu | 42.5 km | 13.1 km | 56 min | 16 min | 3.9 | 5.8 | 5.9 km |
| Fri | 62.5 km | 12.9 km | 66 min | 12 min | 4.1 | 6.3 | 14.1 km |
| Sat | 58.9 km | 13.2 km | 67 min | 16 min | 3.2 | 5.2 | 10.8 km |

Standard deviations

| Day | Total Distance | Mean Distance/Trip | Total Travel Time | Mean Time/Trip | Trips | Stops | Radius of Gyration |
|-----|----------------|--------------------|-------------------|----------------|-------|-------|--------------------|
| Sun | 42.7 km | 13.2 km | 47 min | 20 min | 1.8 | 2.0 | 5.6 km |
| Mon | 22.1 km | 4.6 km | 29 min | 6 min | 2.3 | 2.5 | 4.9 km |
| Tue | 37.7 km | 17.4 km | 29 min | 11 min | 1.8 | 2.3 | 7.5 km |
| Wed | 44.0 km | 13.8 km | 41 min | 12 min | 2.8 | 3.1 | 7.9 km |
| Thu | 33.7 km | 14.2 km | 35 min | 11 min | 2.6 | 3.1 | 5.8 km |
| Fri | 100.0 km | 18.5 km | 91 min | 13 min | 4.1 | 4.3 | 21.7 km |
| Sat | 78.0 km | 13.5 km | 74 min | 14 min | 2.1 | 3.3 | 19.1 km |

We would prefer not to draw conclusions from such a small sample; the standard deviations are in some cases more than twice the estimates themselves. We may also rightly be concerned that this sample is no longer representative of the population we are interested in, and that we may not be able to successfully weight the available data to compensate for the non-responders. It is therefore worth considering alternatives that allow us to use the data that we do have.

Pairwise deletion equivalent

One such alternative is to consider only the complete days, without requiring a complete week. This can be considered a rough analogue to pairwise deletion. Sample persons will not be equally represented in this case, which can be handled in various ways. How do our estimates differ if we consider all available complete days?
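
The threshold in the code below expresses the same 95% criterion at the day level; a one-line sanity check:

0.95 * 24 * 60   # 1368 minutes, the per_day_coverage threshold for a "complete" day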

complete_days <-
  copy(data[gap == FALSE, per_day_coverage := sum(ttn), person_day][per_day_coverage > minutes(1368)])
complete_days_pd <- complete_days[, unique(person_day)]

# Same day-level processing as above, now over all complete person-days
for (i in seq_along(complete_days_pd)) {
  this_person_day <- complete_days_pd[i]
  res <-
    stopFinder(copy(complete_days[person_day == this_person_day]),
               thetaD = 200,
               thetaT = minutes(5))
  
  mergingCycle(
    res,
    thetaD = 200,
    small_track_action = "exclude",
    max_time = minutes(5),
    max_locs = Inf,
    max_dist = 200
  )
  
  events <- returnStateEvents(res)
  
  if (any(events$state == "moving")) {
    segs <- getSegments(tdtr(
      res[state == "moving"],
      col_names = list(
        entity_id_col = "device_id",
        timestamp_col = "timestamp",
        latitude_col
        = "latitude",
        longitude_col = "longitude"
      ),
      group_col = "move_id",
      max_error = 30
    ),
    group = TRUE)
    
    set(
      events,
      j = c(
        "total_travel_dist",
        "total_trip_time",
        "avg_trip_time",
        "n_trips"
      ),
      value = list(
        sum(segs[["segdist"]]),
        events[state == "moving", sum(time_length(as.duration(end_time - begin_time), "minutes"))],
        events[state == "moving", mean(time_length(as.duration(end_time - begin_time), "minutes"))],
        max(events[["move_id"]], na.rm = TRUE)
      )
    )
    events[, avg_dist_per_trip := total_travel_dist / n_trips]
    
    # events[, total_travel_dist := sum(segs$segdist)]
    # events[, n_trips := max(move_id, na.rm = TRUE)]
    # events[, total_trip_time := sum(end_time[state == "moving"] - begin_time[state == "moving"])]
    # events[, avg_trip_time := mean(end_time[state == "moving"] - begin_time[state == "moving"])]
    
  } else {
    set(
      events,
      j = c(
        "total_travel_dist",
        "avg_dist_per_trip",
        "total_trip_time",
        "avg_trip_time",
        "n_trips"
      ),
      value = list(0)
    )
  }
  if (any(events$state == "stopped")) {
    events[, n_stops := max(stop_id, na.rm = TRUE)]
    
  } else {
    set(events,
        j = c("n_stops"),
        value = list(0))
  }
  
  complete_days[person_day == this_person_day,
                c(
                  "total_travel_dist",
                  "avg_dist_per_trip",
                  "total_trip_time",
                  "avg_trip_time",
                  "n_trips",
                  "n_stops",
                  "rog"
                ) :=
                  c(unique(events[, .(
                    total_travel_dist,
                    avg_dist_per_trip,
                    total_trip_time,
                    avg_trip_time,
                    n_trips,
                    n_stops
                  )]),
                  radiusOfGyrationDT(latitude, longitude, timestamp))]
}

tabdat_day <-
  unique(complete_days[, .(
    day_of_week,
    n_trips,
    n_stops,
    total_travel_dist,
    avg_dist_per_trip,
    total_trip_time,
    avg_trip_time,
    rog
  ), person_day])

Considering only complete days, we end up with 320 days represented by 107 respondents. However, some users contribute only 1 day while others contribute as many as 24. How do our estimates change?
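
These counts can be verified directly from complete_days; a sketch (values should match the 320 days and 107 respondents quoted above):

complete_days[, uniqueN(person_day)]                              # complete person-days
complete_days[, uniqueN(device_id)]                               # respondents represented
complete_days[, uniqueN(person_day), device_id][, summary(V1)]    # days contributed per respondent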

Means

| Day | Total Distance | Mean Distance/Trip | Total Travel Time | Mean Time/Trip | Trips | Stops | Radius of Gyration |
|-----|----------------|--------------------|-------------------|----------------|-------|-------|--------------------|
| Sun | 22.3 km | 7.1 km | 25 min | 8 min | 2.1 | 3.6 | 3.8 km |
| Mon | 39.5 km | 9.6 km | 49 min | 13 min | 3.6 | 5.5 | 5.7 km |
| Tue | 36.3 km | 12.6 km | 44 min | 14 min | 3.3 | 5.0 | 7.4 km |
| Wed | 47.5 km | 12.0 km | 49 min | 13 min | 3.6 | 5.5 | 6.6 km |
| Thu | 47.3 km | 12.2 km | 57 min | 14 min | 3.7 | 5.7 | 7.4 km |
| Fri | 54.5 km | 13.0 km | 63 min | 14 min | 4.2 | 6.2 | 8.2 km |
| Sat | 42.9 km | 9.3 km | 53 min | 11 min | 4.2 | 6.1 | 5.3 km |

Standard deviations

| Day | Total Distance | Mean Distance/Trip | Total Travel Time | Mean Time/Trip | Trips | Stops | Radius of Gyration |
|-----|----------------|--------------------|-------------------|----------------|-------|-------|--------------------|
| Sun | 36.4 km | 12.2 km | 30 min | 9 min | 1.9 | 2.6 | 9.6 km |
| Mon | 53.9 km | 11.3 km | 44 min | 11 min | 2.5 | 3.0 | 9.0 km |
| Tue | 58.8 km | 24.9 km | 43 min | 17 min | 2.3 | 2.7 | 17.3 km |
| Wed | 70.8 km | 17.2 km | 48 min | 12 min | 2.8 | 3.3 | 11.1 km |
| Thu | 60.2 km | 14.4 km | 49 min | 11 min | 2.7 | 3.3 | 9.3 km |
| Fri | 78.6 km | 19.5 km | 66 min | 13 min | 3.4 | 4.0 | 14.1 km |
| Sat | 46.8 km | 10.8 km | 43 min | 10 min | 3.0 | 3.6 | 7.4 km |

The increase in the number of respondents relative to the listwise-deletion approach also allows us to begin to break out measurements by personal characteristics, to weight according to the sampling procedure, and to correct for non-response. Let us consider only total travel time and the variables age, urbanicity and gender. If we keep only categories consisting of more than four persons, we have insufficient data to allow for interactions between these variables.

Cell counts

# These packages may already be loaded via utils.R; listed here for clarity
library(haven)      # zap_label(), as_factor()
library(magrittr)   # %>%
library(knitr)      # kable()
library(kableExtra) # kable_styling()

spss <- getSpssData()
Age <- zap_label(spss$LFT)
spss <- as_factor(spss)
setDT(spss)
complete_days[, username2 := as.numeric(username)]
spss[, spssusername := as.numeric(username)]
spss[, LFT := Age]
complete_days[spss, on = .(username2 == spssusername), `:=`(Sex = Geslacht,
                                        Age = LFT,
                                        Urbanicity = STEDBUURT)]

splitdat <- complete_days[, .(`Trip Time` = unique(total_trip_time)), .(person_day, Age, Sex, Urbanicity, os, device_id)]
splitdat[, Age := as.numeric(Age)]
splitdat[, Sex := factor(Sex, levels = c(1, 2), labels = c("Male", "Female"))]
splitdat[, Urban := factor(Urbanicity, levels = levels(splitdat$Urbanicity), labels = c("Very high", "High", "Moderate", "Slight", "Rural", "Unknown"))]
splitdat[, .(mean = mean(`Trip Time`), min = min(`Trip Time`), max = max(`Trip Time`), sd = sd(`Trip Time`), med = median(`Trip Time`), N = uniqueN(device_id)), .(Urban, Age = cut(Age, breaks = c(15, 18, 35, 55, 77)), Sex)][order(Urban, Age, Sex)][N > 4] %>% 
  kable(digits = 0, caption = "Total travel time split by characteristics") %>% 
  kable_styling(full_width = FALSE)

Total travel time split by characteristics

| Urban | Age | Sex | mean | min | max | sd | med | N |
|-------|-----|-----|------|-----|-----|----|----|---|
| Very high | (18,35] | Female | 55 | 0 | 135 | 33 | 58 | 7 |
| High | (35,55] | Female | 48 | 0 | 111 | 39 | 36 | 6 |
| High | (55,77] | Male | 38 | 0 | 93 | 33 | 33 | 5 |
| Slight | (35,55] | Male | 66 | 9 | 156 | 45 | 67 | 5 |
| Slight | (35,55] | Female | 71 | 0 | 124 | 34 | 76 | 5 |
| Slight | (55,77] | Male | 39 | 0 | 98 | 36 | 36 | 5 |
| Rural | (55,77] | Male | 62 | 0 | 223 | 73 | 22 | 6 |

We are only able to look at the main effects of these categories. The table above might be indicative of differential effects, but with so few people in each group, detecting them will be difficult.

splitdat[, .(mean = mean(`Trip Time`), min = min(`Trip Time`), max = max(`Trip Time`), sd = sd(`Trip Time`), med = median(`Trip Time`), N = uniqueN(device_id)), .(Urban)][order(Urban)][N > 4] %>% 
  kable(digits = 0, caption = "By urbanicity") %>% 
  kable_styling(full_width = FALSE) -> urbtab
splitdat[, .(mean = mean(`Trip Time`), min = min(`Trip Time`), max = max(`Trip Time`), sd = sd(`Trip Time`), med = median(`Trip Time`), N = uniqueN(device_id)), .(Age = cut(Age, breaks = c(15, 18, 35, 55, 77)))][order(Age)][N > 4] %>% 
  kable(digits = 0, caption = "By age") %>% 
  kable_styling(full_width = FALSE)  -> agetab
splitdat[, .(mean = mean(`Trip Time`), min = min(`Trip Time`), max = max(`Trip Time`), sd = sd(`Trip Time`), med = median(`Trip Time`), N = uniqueN(device_id)), .(Sex)][order(Sex)][N > 4] %>% 
  kable(digits = 0, caption = "By sex") %>% 
  kable_styling(full_width = FALSE)  -> sextab

cat(c('<table><tr valign="top"><td>', urbtab, '</td><td>', agetab, '</td><td>', sextab, '</td><tr></table>'),
    sep = '')

By urbanicity

| Urban | mean | min | max | sd | med | N |
|-------|------|-----|-----|----|----|---|
| Very high | 43 | 0 | 172 | 34 | 36 | 20 |
| High | 37 | 0 | 124 | 35 | 31 | 25 |
| Moderate | 55 | 0 | 267 | 55 | 51 | 23 |
| Slight | 55 | 0 | 324 | 52 | 49 | 24 |
| Rural | 49 | 0 | 223 | 55 | 30 | 15 |

By age

| Age | mean | min | max | sd | med | N |
|-----|------|-----|-----|----|----|---|
| (18,35] | 54 | 0 | 324 | 60 | 39 | 34 |
| (35,55] | 55 | 0 | 172 | 39 | 52 | 36 |
| (55,77] | 40 | 0 | 223 | 43 | 27 | 33 |

By sex

| Sex | mean | min | max | sd | med | N |
|-----|------|-----|-----|----|----|---|
| Male | 52 | 0 | 324 | 57 | 36 | 55 |
| Female | 46 | 0 | 231 | 40 | 43 | 52 |

If we consider the number of people who sent us at least some location data, our N per category grows considerably. We can consider this the maximum possible N achievable before accounting for non-response.

data[, username := as.numeric(username)]
data[spss, on = .(username == spssusername), `:=`(Sex = Geslacht, Age = LFT, Urbanicity = STEDBUURT)]
data[, Age := as.numeric(Age)]
data[, AgeC := cut(Age, breaks = c(14, 18, 35, 55, 91))]
# data[, Sex := factor(Sex, levels = c(1, 2), labels = c("Male", "Female"))]
data[, Urban := factor(Urbanicity, levels = levels(splitdat$Urbanicity), labels = c("Very high", "High", "Moderate", "Slight", "Rural", "Unknown"))]
data[, .(`Cell count` = uniqueN(username)), .(Urban, AgeC, Sex)][order(Urban, AgeC, Sex)] %>% kable() %>% kable_styling(full_width = F)

| Urban | AgeC | Sex (1 = Male, 2 = Female) | Cell count |
|-------|------|----------------------------|------------|
| Very high | (14,18] | 1 | 4 |
| Very high | (14,18] | 2 | 2 |
| Very high | (18,35] | 1 | 28 |
| Very high | (18,35] | 2 | 30 |
| Very high | (35,55] | 1 | 20 |
| Very high | (35,55] | 2 | 17 |
| Very high | (55,91] | 1 | 13 |
| Very high | (55,91] | 2 | 13 |
| High | (14,18] | 1 | 5 |
| High | (14,18] | 2 | 5 |
| High | (18,35] | 1 | 19 |
| High | (18,35] | 2 | 21 |
| High | (35,55] | 1 | 20 |
| High | (35,55] | 2 | 33 |
| High | (55,91] | 1 | 24 |
| High | (55,91] | 2 | 17 |
| Moderate | (14,18] | 1 | 3 |
| Moderate | (14,18] | 2 | 7 |
| Moderate | (18,35] | 1 | 13 |
| Moderate | (18,35] | 2 | 16 |
| Moderate | (35,55] | 1 | 20 |
| Moderate | (35,55] | 2 | 21 |
| Moderate | (55,91] | 1 | 17 |
| Moderate | (55,91] | 2 | 17 |
| Slight | (14,18] | 1 | 2 |
| Slight | (14,18] | 2 | 5 |
| Slight | (18,35] | 1 | 12 |
| Slight | (18,35] | 2 | 13 |
| Slight | (35,55] | 1 | 22 |
| Slight | (35,55] | 2 | 20 |
| Slight | (55,91] | 1 | 16 |
| Slight | (55,91] | 2 | 21 |
| Rural | (14,18] | 1 | 5 |
| Rural | (14,18] | 2 | 4 |
| Rural | (18,35] | 1 | 5 |
| Rural | (18,35] | 2 | 12 |
| Rural | (35,55] | 1 | 13 |
| Rural | (35,55] | 2 | 17 |
| Rural | (55,91] | 1 | 18 |
| Rural | (55,91] | 2 | 6 |

DTWBI

McCool et al. (2022) describe a method for imputing aggregate travel characteristics that preserves the maximum amount of data. It is a two-step process in which small gaps are first interpolated, and long gaps are then imputed on the basis of how similar the surrounding time series of travel behavior is to other, complete series, in a process called Dynamic Time Warping-Based Imputation (DTWBI). In our data, the majority (75%) of gaps are under 30 minutes in length.
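
The share of short gaps can be computed from the gap intervals identified earlier; a sketch using the gap and ttn columns defined above:

data[gap == TRUE & !is.na(ttn), mean(ttn < minutes(30))]   # proportion of gaps shorter than 30 minutes (~0.75 per the text)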

Linear interpolation

data2 <- interpolatePoints(data, min_time = minutes(3), max_time = minutes(30), min_dist = -1)

data2[, newttn := shift(timestamp, n = -1) - timestamp, person_day]
data2[, gap := TRUE]
data2[newttn < minutes(5), gap := FALSE]

sg_complete <-
  copy(data2[gap == FALSE, per_day_coverage := sum(newttn), person_day][per_day_coverage > minutes(1368)])
sg_complete_pd <- sg_complete[, unique(person_day)]

# Same day-level processing as above, now on the interpolated data
for (i in seq_along(sg_complete_pd)) {
  this_person_day <- sg_complete_pd[i]
  res <-
    stopFinder(copy(sg_complete[person_day == this_person_day]),
               thetaD = 200,
               thetaT = minutes(5))
  
  mergingCycle(
    res,
    thetaD = 200,
    small_track_action = "exclude",
    max_time = minutes(5),
    max_locs = Inf,
    max_dist = 200
  )
  
  events <- returnStateEvents(res)
  
  if (any(events$state == "moving")) {
    segs <- getSegments(tdtr(
      res[state == "moving"],
      col_names = list(
        entity_id_col = "device_id",
        timestamp_col = "timestamp",
        latitude_col
        = "latitude",
        longitude_col = "longitude"
      ),
      group_col = "move_id",
      max_error = 30
    ),
    group = TRUE)
    
    set(
      events,
      j = c(
        "total_travel_dist",
        "total_trip_time",
        "avg_trip_time",
        "n_trips"
      ),
      value = list(
        sum(segs[["segdist"]]),
        events[state == "moving", sum(time_length(as.duration(end_time - begin_time), "minutes"))],
        events[state == "moving", mean(time_length(as.duration(end_time - begin_time), "minutes"))],
        max(events[["move_id"]], na.rm = TRUE)
      )
    )
    events[, avg_dist_per_trip := total_travel_dist / n_trips]
    
    # events[, total_travel_dist := sum(segs$segdist)]
    # events[, n_trips := max(move_id, na.rm = TRUE)]
    # events[, total_trip_time := sum(end_time[state == "moving"] - begin_time[state == "moving"])]
    # events[, avg_trip_time := mean(end_time[state == "moving"] - begin_time[state == "moving"])]
    
  } else {
    set(
      events,
      j = c(
        "total_travel_dist",
        "avg_dist_per_trip",
        "total_trip_time",
        "avg_trip_time",
        "n_trips"
      ),
      value = list(0)
    )
  }
  if (any(events$state == "stopped")) {
    events[, n_stops := max(stop_id, na.rm = TRUE)]
    
  } else {
    set(events,
        j = c("n_stops"),
        value = list(0))
  }
  
  sg_complete[person_day == this_person_day,
                c(
                  "total_travel_dist",
                  "avg_dist_per_trip",
                  "total_trip_time",
                  "avg_trip_time",
                  "n_trips",
                  "n_stops",
                  "rog"
                ) :=
                  c(unique(events[, .(
                    total_travel_dist,
                    avg_dist_per_trip,
                    total_trip_time,
                    avg_trip_time,
                    n_trips,
                    n_stops
                  )]),
                  radiusOfGyrationDT(latitude, longitude, timestamp))]
}
sg_complete[, day_of_week := na.exclude(day_of_week)[1], person_day]

tabdat_sg_day <-
  unique(sg_complete[, .(
    day_of_week,
    n_trips,
    n_stops,
    total_travel_dist,
    avg_dist_per_trip,
    total_trip_time,
    avg_trip_time,
    rog
  ), person_day])

Means

| Day | Total Distance | Mean Distance/Trip | Total Travel Time | Mean Time/Trip | Trips | Stops | Radius of Gyration |
|-----|----------------|--------------------|-------------------|----------------|-------|-------|--------------------|
| Sun | 27.8 km | 8.6 km | 34 min | 11 min | 2.6 | 4.2 | 4.4 km |
| Mon | 48.6 km | 11.3 km | 58 min | 14 min | 4.0 | 6.0 | 6.5 km |
| Tue | 44.6 km | 12.8 km | 52 min | 14 min | 3.6 | 5.1 | 7.3 km |
| Wed | 53.6 km | 14.1 km | 52 min | 13 min | 3.6 | 5.5 | 7.2 km |
| Thu | 55.7 km | 12.8 km | 66 min | 14 min | 4.0 | 5.8 | 7.5 km |
| Fri | 50.8 km | 10.8 km | 60 min | 13 min | 4.5 | 6.5 | 7.0 km |
| Sat | 49.0 km | 11.4 km | 56 min | 13 min | 4.3 | 6.2 | 6.1 km |

Standard deviations

| Day | Total Distance | Mean Distance/Trip | Total Travel Time | Mean Time/Trip | Trips | Stops | Radius of Gyration |
|-----|----------------|--------------------|-------------------|----------------|-------|-------|--------------------|
| Sun | 39.6 km | 12.7 km | 34 min | 12 min | 2.3 | 2.9 | 9.8 km |
| Mon | 81.1 km | 21.0 km | 63 min | 11 min | 3.3 | 3.9 | 12.0 km |
| Tue | 56.8 km | 20.9 km | 49 min | 15 min | 2.5 | 2.9 | 13.7 km |
| Wed | 87.5 km | 27.6 km | 51 min | 13 min | 2.8 | 3.2 | 13.9 km |
| Thu | 82.6 km | 18.5 km | 75 min | 13 min | 3.9 | 4.1 | 10.6 km |
| Fri | 71.7 km | 16.5 km | 58 min | 12 min | 3.7 | 4.4 | 11.8 km |
| Sat | 61.0 km | 17.4 km | 50 min | 12 min | 3.1 | 3.7 | 10.7 km |

Using only interpolation, we are able to use 604 days across 160 persons. We are interested in what can be gained from also filling in the long gaps. DTWBI looks at the pattern of movement behavior over time, aggregating it into time series with the goal of finding similar patterns. When a respondent's day contains a long gap, we need a buffer of sufficient length on either side of the missing data in order to generate informative data against which to compare. The length of this buffer can vary: the shorter the buffer, the less specific we can be when selecting imputation candidates. For our purposes, a buffer of at least one hour is necessary.
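
A sketch verifying the counts quoted above from sg_complete:

sg_complete[, uniqueN(person_day)]   # usable person-days after interpolation (604 per the text)
sg_complete[, uniqueN(device_id)]    # respondents represented (160 per the text)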

Problems

Because respondents may have multiple gaps within a 24-hour period and because some gaps may occur at the beginning or end of the period, not all days can be imputed with DTWBI. We require that the total sum of covered time before and after the gap is at least 110 minutes. To reduce our workload, if we can calculate that a day will not reach at least 95% coverage (1368 minutes), we will not impute that day. It might be possible in the future to include these days so that they can serve as references for imputation, even if they lack sufficient information to be imputed themselves.

data2[gap == FALSE, new_per_day_coverage := sum(newttn), person_day]
data2[, new_complete := FALSE]
data2[new_per_day_coverage >= minutes(1368), new_complete := TRUE]
data2[new_complete == FALSE, candtw := bufferTest(.SD, hours(1), minutes(1368)), person_day]

Requiring a buffer on both sides for DTWBI provides us with only 187 new days. This is suboptimal, and it occurs because we have prematurely divided our dataset into days. If instead we maintained the data as uninterrupted streams until after the imputation, we could often recover the data on both sides of, for example, nighttime missingness. We have two options in this case: 1. adapt DTWBI for beginning/end missingness, incorporating a match buffer on only one side, or 2. perform DTWBI before the data are split into days, then split after imputation.

# data2[, lastttn := (hms::as_hms("04:00:00") - hms::as_hms(timestamp[.N])), person_day]
# data2[lastttn < 0, lastttn := lastttn + as.duration(hours(24))]
# data2[is.na(newttn), newttn := lastttn, person_day]

A problem we run into with solution 2 is that many of the long gaps (which could otherwise be handled with DTWBI) are interspersed with single measurements, which keeps us from having the full length of matching buffer. Essentially, those extra pieces of data keep the model from working. The figure below illustrates this issue: the black line shows the gap time, and the color on either side of it shows the one-hour match buffer. The best solution here is likely to combine the intersecting gaps into a single gap, recording the distance covered (if any) by the single points flanked by long gaps, and then handling those as necessary.

data3 <- copy(data2[, .(device_id, latitude, longitude, accuracy, timestamp, timestamp_date, adj_date, day_of_week, person_day)])
data3[, tfp := timestamp - shift(timestamp, n = 1), device_id]
# data3[, tfp := shift(ttn)]
data3[, newid := rleid(device_id, !is.na(tfp) & ifelse(tfp > hours(12), tfp, 0))]
data3[, person_block := paste(device_id, newid, sep = "_")]

data3[, ttn := shift(timestamp,n = -1) - timestamp, person_block]
data3[, gap := TRUE]
data3[ttn < minutes(5), gap := FALSE]

test <- data3[person_block == "100_1"]
gaps <- test[gap == TRUE, .(person_block, gapid = .I, gaptime = timestamp, ttn, start = timestamp - hours(1), end = timestamp + ttn + hours(1))]
ggplot(gaps, aes(x = gaptime, y = gapid)) +
  geom_linerange(aes(xmin = start, xmax = end, color = factor(gapid))) +
  geom_linerange(aes(xmin = gaptime, xmax = gaptime + ttn), color = "black", size = 1) + 
  scale_x_datetime("Time")+
  scale_y_continuous("Gap number")+
  scale_color_discrete("Gap number")+
  theme_minimal() +
  theme(legend.position = "none")

gaps <- data3[gap == TRUE, .(gapid = 1:.N, gaptime = timestamp, start = timestamp - hours(1), end = timestamp + ttn + hours(1)), person_block]
gaps[, newgap := mergeGap(end, start), person_block]
gaps[, `:=`(newstart = min(start), newend = max(end, na.rm = TRUE)), .(person_block, newgap)]
gaps[start == newstart, forebuffer_end := gaptime]
gaps[end == newend, endbuffer_start := gaptime]
gaps[, `:=`(endbuffer_start = na.exclude(endbuffer_start)[1], forebuffer_end = na.exclude(forebuffer_end)[1]), .(person_block, newgap)]
data3[gaps,
     on = .(person_block, timestamp >= newstart, timestamp <= newend),
     `:=`(newgap = i.newgap, newstart = i.newstart, newend = i.newend, forebuffer_end = i.forebuffer_end, endbuffer_start = i.endbuffer_start)]

data3[, extra_point := FALSE]
data3[!is.na(newgap) & gap == FALSE, extra_point := timestamp %between% c(forebuffer_end[1], endbuffer_start[1]), .(person_block, newgap)]
data3[!is.na(newgap), candtw := testBufferSingleGap(.SD), .(person_block, newgap)]
data3[!is.na(newgap), atleast24h := testBufferAfterMerge(.SD, min.day.minutes = minutes(1368)), .(person_block)]
data3[gap == TRUE | extra_point == TRUE, interdist := sum(geodist_vec(longitude, latitude, sequential = TRUE, measure = "haversine")), .(person_block, newgap)]
data3[!is.na(newgap), interdist := na.exclude(interdist)[1], .(person_block, newgap)]
data3[, newgaplength := (endbuffer_start[1] - forebuffer_end[1]), .(person_block, newgap)]
data3[newgaplength > hours(12), candtw := FALSE]
data3[, aftermergeid := paste(person_block, rleid(newgap), sep = "_"), .(person_block)]

If we merge the long gaps, we end up with 2354 long gaps that can be imputed. DTWBI requires that we create a uniform discrete time series, aggregating the travel behavior within each interval. We have chosen 15 minutes, striking a balance between periods short enough to represent differential travel behavior and long enough to ensure that they encompass multiple location measurements when the minimum gap is set to 5 minutes in length.
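
As a small illustration of the 15-minute chunk indexing used below (assuming lubridate period arithmetic), a point 47 minutes into a block falls into chunk 3, i.e. the fourth 15-minute interval:

floor(as.period(minutes(47)) / minutes(15))   # 3, matching the ftchunk calculation below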

setorder(data3, device_id, timestamp)
data3[, hasmissings := any(gap == TRUE), aftermergeid]
full <- copy(data3[hasmissings == FALSE | candtw == TRUE]) # we lose about 1 million records and 4,000 person_blocks that do not contain enough information
full[, time_elapsed := timestamp - timestamp[1], aftermergeid]
full[, ftchunk := floor(as.period(time_elapsed)/minutes(15))]
full[, fifteens := sprintf("%04d", ftchunk)]
full[, grouping := paste(aftermergeid, fifteens, sep = "-")]

setorder(full, aftermergeid, ftchunk)
started.at=proc.time()
res <- tdtr(full, col_names = list(entity_id_col = "aftermergeid", timestamp_col = "timestamp", latitude_col
                                   = "latitude", longitude_col = "longitude"), group_col = "grouping", max_error = 30)
cat("Finished in",timetaken(started.at),"\n") ## 13 minutes
saveRDS(res, "./data/tdtrdata_221810.RDS")
segs <- getSegments(res, group = TRUE)
saveRDS(segs, "./data/segs_221810.RDS")

We add in the missing time series entries, setting the distance to NA.

segs <- readRDS("./data/segs_221810.RDS")
setDT(segs)
databyfifteen <- segs[, .(tsstart = min(seg_start_time),
                          tsend = max(seg_end_time),
                          dist = sum(segdist),
                          tsstartlat = seg_start_lat[1],
                          tsendlat = seg_end_lat[.N],
                          tsstartlon = seg_start_lon[1],
                          tsendlon = seg_end_lon[.N]),
                      .(entity_id, .id)]
databyfifteen[, c("person_block", "fifteens") := tstrsplit(.id, split = "-")]
databyfifteen[, ftchunk := as.numeric(fifteens)]
databyfifteen[, tsstart := as_datetime(tsstart)]
databyfifteen[, tsend := as_datetime(tsend)]
databyfifteen[, first_timestamp := (min(tsstart)), entity_id]

missingfifteens <- databyfifteen[, seq(from = 0, to = max(ftchunk))[!seq(from = 0, to = max(ftchunk)) %in% ftchunk], entity_id]
missingfifteens[, `:=`(lastbefore = min(V1) - 1,
                       firstafter = max(V1) + 1), entity_id]
newrows <- databyfifteen[missingfifteens, on = .(entity_id, ftchunk == lastbefore),
                         .(entity_id,
                           .id = paste(entity_id, sprintf("%04d", i.V1), sep = "-"),
                           tsstart = first_timestamp + minutes(i.V1 * 15) ,
                           tsend = first_timestamp + minutes((i.V1 + 1) * 15),
                           dist = NA,
                           tsstartlat = x.tsendlat,
                           tsendlat = NA_real_,
                           tsstartlon = x.tsendlon,
                           tsendlon = NA_real_,
                           person_block,
                           fifteens = sprintf("%04d", i.V1),
                           ftchunk = i.V1,
                           first_timestamp,
                           firstafter)]
newrows[databyfifteen, on = .(entity_id, firstafter == ftchunk), `:=`(tsendlat = i.tsstartlat, tsendlon = i.tsstartlon)]
set(newrows, j = "firstafter", value = NULL)

databyfifteen <- rbindlist(list(
  databyfifteen, newrows
))

setorder(databyfifteen, entity_id, ftchunk)
databyfifteen[, first_timestamp := NULL]
databyfifteen[, time := hms::as_hms(tsstart)]
databyfifteen[, maxftchunk := max(ftchunk), entity_id]

This gives us 4928 complete or imputable slices of varying length. The next step is imputation using DTWBI with the Burst parameters. First we separate out the 'queries', which are the series containing long-gap missingness, and we order them by the amount of missingness.

n_imps <- 10
prob.power <- 4
window.size <- hours(3)
match.window <- 4

impdata <-
    copy(rbindlist(replicate(n_imps, databyfifteen, simplify = FALSE), idcol = "imp"))
impdata[, orig_dist := dist]

queries <- impdata[imp == 1, .SD[any(is.na(dist))], entity_id]
query_order <- queries[, sum(is.na(dist)), entity_id][order(V1, decreasing = FALSE), entity_id]

library(doParallel)
detectCores()
## [1] 16
registerDoParallel(10)

allimps <- foreach(
    this_imp = seq_len(n_imps),
    .combine = rbind,
    .inorder = FALSE,
    .multicombine = TRUE
  ) %dopar% {
    for (rechain_nr in seq_len(2)) {
      cat("Chain number", rechain_nr, "\n", file = "")
      # for (i in seq_along(query_order)) {
      for (i in 2) {
        refs <-
          impdata[imp == this_imp &
                    entity_id != query_order[i], # Select only those for this imputation chain
                  .SD[!any(is.na(dist))],                   # That are complete (includes previously imputed)
                  entity_id]                                     # Grouped by entity_id so we get only those with no NAs when we use any()
        this_query <-
          queries[entity_id == query_order[i]] # Select the query for innermost loop
        miss       <-
          this_query[, is.na(dist)]       # Extract the vector of missingness
        obs        <-
          !miss                           # The inverse observed vector

        # query <- this_query
        impdata <- imputeMidmissing(
          impdata  = impdata,
          this_imp = this_imp,
          query    = this_query,
          obs      = obs,
          refs     = refs,
          n_imps   = 1,
          prob.power   = prob.power,
          window       = window.size,
          match.window = match.window
        )

      } # End query iteration
    }   # End chaining iteration
    impdata[imp == this_imp]
  }     # End %dopar% over n_imps



allimps[, `:=`(
  window.size = window.size,
  prob.power = prob.power,
  match.window = match.window
)]
  
allimps[is.na(orig_dist) & !is.na(dist)]

| imp | entity_id | .id | tsstart | tsend | dist |
|-----|-----------|-----|---------|-------|------|
| 1 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 0.000000 |
| 2 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 3.696013 |
| 3 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 154.512946 |
| 4 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 8.658444 |
| 5 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 0.939696 |
| 6 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 128.711587 |
| 7 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 2.530084 |
| 8 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 9.753136 |
| 9 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 360.101091 |
| 10 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 703.494775 |
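
The ten imputed values per originally-missing slice can then be summarised to inspect between-imputation variability; a sketch (not part of the original pipeline):

allimps[is.na(orig_dist) & !is.na(dist),
        .(mean_dist = mean(dist), sd_dist = sd(dist), n_imps = .N),
        .(entity_id, .id)]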