library(forcats) # Factor reordering
library(ggplot2) # Plots
library(data.table)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(geodist)
library(stopdetection)
library(topdowntimeratio)
## 
## Attaching package: 'topdowntimeratio'
## The following object is masked from 'package:stopdetection':
## 
##     radiusOfGyrationDT
source("utils.R")
source("dtwbi_functions.R")
## Loading required package: proxy
## 
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
## Loaded dtw v1.22-3. See ?dtw for help, citation("dtw") for use in publication.
## 
## Attaching package: 'hms'
## The following object is masked from 'package:lubridate':
## 
##     hms
knitr::opts_chunk$set(cache = FALSE, warning = FALSE, 
                      message = FALSE, cache.lazy = FALSE)

Data cleaning

Loading

Raw location data collected from respondents was used. All entries with an accuracy of 200 or higher were removed.

# library(stopdetection)

source("~/cbs_share/projecten/BPM/301707WaarnVnw_SEC1/Werk/Tabi Verplaatsingen App/R code Danielle McCool/New code suite/file_reading_functions.R")
source("~/cbs_share/projecten/BPM/301707WaarnVnw_SEC1/Werk/Tabi Verplaatsingen App/R code Danielle McCool/New code suite/file_merging_functions.R")
source("~/cbs_share/projecten/BPM/301707WaarnVnw_SEC1/Werk/Tabi Verplaatsingen App/R code Danielle McCool/New code suite/file_cleaning_functions.R")

# Set up reference data ---------------------------------------------------

data <- readInData(getNewFilenames(as.if.day = "2018-12-15"))

# Remove unnecessary columns
data[, created_at := NULL]
data[, id := NULL]
data[, distance_between_previous_position := NULL]

# Remove duplicates
data <- copy(unique(data, by = c("device_id", "latitude", "longitude", "timestamp")))

# Remove data from cell towers
data <- data[accuracy < 200]

# We have to do the ordering a number of times
setorder(data, device_id, timestamp)


# These are the standard data loading functions I've used for two years
devices   <- readInData(getNewFilenames(type = "device", as.if.day = "2018-12-16"))
users     <- readInData(getNewFilenames(type = "user", as.if.day = "2018-12-16"))
data      <- mergeDataDevices(data, devices)
data      <- mergeDataUsers(data, users)
data      <- keepParticipants(data)
data <- data[!is.na(device_id)]
data <- data[!is.na(timestamp)]

data[, c("altitude", "speed", "desired_accuracy", "heading") := list(NULL)]
## Timestamp duplications
data[!duplicated(timestamp), dupid := 0]
data[duplicated(timestamp), dupid := 1:.N, .(device_id, timestamp)]
data[, timestamp2 := timestamp + microseconds(dupid)]
data[, timestamp := timestamp2]
setorder(data, device_id, timestamp)

Creating days

To facilitate comparison between methods, movement behavior was subdivided into 24-hour periods starting at 4 a.m., so that late-night travel behavior occurring after midnight is captured with the preceding day.

# Correct timezone display
data[, timestamp := with_tz(timestamp, tzone = "Europe/Amsterdam")]
data[, timestamp_date := floor_date(timestamp, "day")]

# Timestamps after 04:00 should be the timestamp date
# Timestamps between midnight and 03:59:59.999 should be the timestamp date minus one
data[,  adj_date := floor_date(timestamp - hours(4), "day")]
data[, day_of_week := wday(adj_date, label = TRUE)]
data[, day_of_week := ordered(day_of_week)]
data[, person_day := paste(device_id, format(adj_date, ("%d-%m")), sep = "_")]

Establishing missingness

We select a criterion for how much time must elapse before we consider someone to have missing data; in this case, we opt for a maximum gap of 5 minutes. Because people often travel in a way that is similar from week to week, but may differ moderately between the days of the week, we are interested in recording movement for each day of the week for each person. The figure below shows, for each participant, the number of recorded hours on the best-covered instance of each weekday.

setorder(data, device_id, timestamp)
data[, ttn := shift(timestamp,n = -1) - timestamp, person_day]
data[, gap := TRUE]
data[ttn < minutes(5), gap := FALSE]

# Take the sum of all intervals shorter than the minimum gap length per day of
# week
coverage <-
  copy(data[gap == FALSE,
            .(hours = time_length(sum(ttn), "hours")),
            .(device_id, person_day, day_of_week)])

# For each day of the week for a person, find the one with the best coverage
cvplot <- 
  coverage[,
           .(max_hrs = round(max(hours), 2)),
           .(device_id, day_of_week)][order(device_id, day_of_week)]

# Find the week total
cvplot[, sumhrs := sum(max_hrs), device_id]

# Order the device_ids by the week total
cvplot[, device_id_ordered := fct_reorder(factor(device_id), sumhrs)]

# Tile map of participant weeks
ggplot(cvplot, aes(y = day_of_week, x = device_id_ordered, fill = max_hrs)) +
  geom_tile() +
  viridis::scale_fill_viridis(option = "A") +
  labs(x = "Participant", y = "", fill = "Hours")+
  theme(axis.ticks.x = element_blank(),
        axis.text.x = element_blank(),
        legend.position = "bottom",
        legend.direction = "horizontal",
        legend.margin = margin(),
        legend.key.width = unit(3, "cm"),
        legend.key.height = unit(.5, "cm")) +
  coord_fixed(ratio = 20)

A participant with seven days of complete data would have a total of 168 hours. In practice, participants had a mean of 59.4 hours and a median of 54.4 hours.
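
These summary figures can be reproduced from the per-participant weekly totals in cvplot; a minimal sketch (rounding may differ slightly from the text):

week_totals <- cvplot[, .SD[1], device_id]   # one row per participant
week_totals[, .(mean_hours = mean(sumhrs), median_hours = median(sumhrs))]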

cvplot[, .SD[1], device_id] %>% 
  ggplot(aes(x = sumhrs)) +
  geom_histogram(binwidth = 4) +
  scale_x_continuous("Week hour sum") +
  theme_minimal()

Results using other methods

Listwise deletion

The most common method of addressing missingness is listwise deletion, in which only data with no gaps are used. When the missingness mechanism is missing completely at random (MCAR; the probability of being missing is the same for all cases), this produces unbiased results, although it is less efficient, with larger standard errors than if all available data were used. Let us say that at least 95% of the total week time must be covered, giving a cutoff of approximately 160 hours. This would leave us with 12 persons out of our 604 respondents. We may then perform our analyses on the basis of only these data.
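
As a quick check of the cutoff arithmetic and the resulting sample size (a sketch using the objects defined above):

0.95 * 7 * 24                              # 159.6 hours; rounded to the 160-hour cutoff used below
cvplot[sumhrs > 160, uniqueN(device_id)]   # number of respondents with a near-complete week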

complete_week_ids <- cvplot[sumhrs > 160, unique(device_id)]
complete_week_pd <- coverage[device_id %in% complete_week_ids, person_day[hours == max(hours)], .(device_id, day_of_week)]$V1
complete_week_data <- copy(data[person_day %in% complete_week_pd])


# For each complete person-day: detect stops, merge tracks, segment movement with TDTR,
# and compute day-level travel summaries
for (i in seq_along(complete_week_pd)) {
  this_person_day <- complete_week_pd[i]
  res <- stopFinder(copy(complete_week_data[person_day == this_person_day]), thetaD = 200, thetaT = minutes(5))
  mergingCycle(res, thetaD = 200, small_track_action = "exclude", max_time = minutes(5), max_locs = Inf, max_dist = 200)
  events <- returnStateEvents(res)
  
  if (any(events$state == "moving")) {
    segs <- getSegments(tdtr(res[state == "moving"],
                             col_names = list(entity_id_col = "device_id", timestamp_col = "timestamp", latitude_col
                                              = "latitude", longitude_col = "longitude"),
                             group_col = "move_id", max_error = 30), group = TRUE)
    
    
    events[, total_travel_dist := sum(segs$segdist)]
    events[, n_trips := max(move_id, na.rm = TRUE)]
    events[, avg_dist_per_trip := total_travel_dist / n_trips]
    events[state == "moving", total_trip_time := sum(time_length(as.duration(end_time - begin_time), "minutes"))]
    events[, total_trip_time := na.exclude(total_trip_time)[1]]
    events[state == "moving", avg_trip_time := mean(time_length(as.duration(end_time - begin_time), "minutes"))]
    events[, avg_trip_time := na.exclude(avg_trip_time)[1]]
    
  } else {
    set(events,
        j = c("total_travel_dist", "avg_dist_per_trip", "total_trip_time", "avg_trip_time", "n_trips"),
        value = list(0))
  }
  if (any(events$state == "stopped")) {
    events[, n_stops := max(stop_id, na.rm = TRUE)]
    
  } else {
    set(events,
        j = c("n_stops"),
        value = list(0))
  }
  
  complete_week_data[person_day == this_person_day,
                     c("total_travel_dist", "avg_dist_per_trip", "total_trip_time", "avg_trip_time", "n_trips", "n_stops", "rog") :=
                       c(unique(events[, .(total_travel_dist, avg_dist_per_trip, total_trip_time, avg_trip_time, n_trips, n_stops)]), radiusOfGyrationDT(latitude, longitude, timestamp))]
}

tabdat <-
  unique(complete_week_data[, .(
    day_of_week,
    n_trips,
    n_stops,
    total_travel_dist,
    avg_dist_per_trip,
    total_trip_time,
    avg_trip_time,
    rog
  ), person_day])

# tabdat[, day_of_week := ordered(day_of_week)]

Means

| Day | Total Distance | Mean Distance/Trip | Total Travel Time | Mean Time/Trip | Trips | Stops | Radius of Gyration |
|-----|----------------|--------------------|-------------------|----------------|-------|-------|--------------------|
| Sun | 23.1 km | 7.4 km | 28 min | 10 min | 1.5 | 2.6 | 2.7 km |
| Mon | 21.6 km | 5.0 km | 34 min | 9 min | 2.9 | 5.1 | 3.6 km |
| Tue | 33.0 km | 11.3 km | 42 min | 13 min | 2.9 | 5.2 | 5.5 km |
| Wed | 39.8 km | 11.5 km | 46 min | 13 min | 3.1 | 4.6 | 6.0 km |
| Thu | 42.5 km | 13.1 km | 56 min | 16 min | 3.9 | 5.8 | 5.9 km |
| Fri | 62.5 km | 12.9 km | 66 min | 12 min | 4.1 | 6.3 | 14.1 km |
| Sat | 58.9 km | 13.2 km | 67 min | 16 min | 3.2 | 5.2 | 10.8 km |

Standard deviations

| Day | Total Distance | Mean Distance/Trip | Total Travel Time | Mean Time/Trip | Trips | Stops | Radius of Gyration |
|-----|----------------|--------------------|-------------------|----------------|-------|-------|--------------------|
| Sun | 42.7 km | 13.2 km | 47 min | 20 min | 1.8 | 2.0 | 5.6 km |
| Mon | 22.1 km | 4.6 km | 29 min | 6 min | 2.3 | 2.5 | 4.9 km |
| Tue | 37.7 km | 17.4 km | 29 min | 11 min | 1.8 | 2.3 | 7.5 km |
| Wed | 44.0 km | 13.8 km | 41 min | 12 min | 2.8 | 3.1 | 7.9 km |
| Thu | 33.7 km | 14.2 km | 35 min | 11 min | 2.6 | 3.1 | 5.8 km |
| Fri | 100.0 km | 18.5 km | 91 min | 13 min | 4.1 | 4.3 | 21.7 km |
| Sat | 78.0 km | 13.5 km | 74 min | 14 min | 2.1 | 3.3 | 19.1 km |

We would prefer not to draw conclusions from such a small sample; the standard deviations are in some cases more than twice the estimates themselves. We may also rightly be concerned that this sample is no longer representative of the population we are interested in, and that we may not be able to successfully weight the available data to compensate for the non-responders. It is therefore worth considering alternatives that allow us to use the data that we do have.

Pairwise deletion equivalent

One such alternative is to consider only the complete days, without requiring a complete week. This can be considered a rough analogue to pairwise deletion. Sample persons will not be equally represented in this case, which can be handled in various ways. How do our estimates differ if we consider all available complete days?
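
The threshold in the code below expresses the same 95% criterion at the day level; a one-line sanity check:

0.95 * 24 * 60   # 1368 minutes, the per_day_coverage threshold for a "complete" day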

complete_days <-
  copy(data[gap == FALSE, per_day_coverage := sum(ttn), person_day][per_day_coverage > minutes(1368)])
complete_days_pd <- complete_days[, unique(person_day)]

# Same day-level processing as above, now over all complete person-days
for (i in seq_along(complete_days_pd)) {
  this_person_day <- complete_days_pd[i]
  res <-
    stopFinder(copy(complete_days[person_day == this_person_day]),
               thetaD = 200,
               thetaT = minutes(5))
  
  mergingCycle(
    res,
    thetaD = 200,
    small_track_action = "exclude",
    max_time = minutes(5),
    max_locs = Inf,
    max_dist = 200
  )
  
  events <- returnStateEvents(res)
  
  if (any(events$state == "moving")) {
    segs <- getSegments(tdtr(
      res[state == "moving"],
      col_names = list(
        entity_id_col = "device_id",
        timestamp_col = "timestamp",
        latitude_col
        = "latitude",
        longitude_col = "longitude"
      ),
      group_col = "move_id",
      max_error = 30
    ),
    group = TRUE)
    
    set(
      events,
      j = c(
        "total_travel_dist",
        "total_trip_time",
        "avg_trip_time",
        "n_trips"
      ),
      value = list(
        sum(segs[["segdist"]]),
        events[state == "moving", sum(time_length(as.duration(end_time - begin_time), "minutes"))],
        events[state == "moving", mean(time_length(as.duration(end_time - begin_time), "minutes"))],
        max(events[["move_id"]], na.rm = TRUE)
      )
    )
    events[, avg_dist_per_trip := total_travel_dist / n_trips]
    
    # events[, total_travel_dist := sum(segs$segdist)]
    # events[, n_trips := max(move_id, na.rm = TRUE)]
    # events[, total_trip_time := sum(end_time[state == "moving"] - begin_time[state == "moving"])]
    # events[, avg_trip_time := mean(end_time[state == "moving"] - begin_time[state == "moving"])]
    
  } else {
    set(
      events,
      j = c(
        "total_travel_dist",
        "avg_dist_per_trip",
        "total_trip_time",
        "avg_trip_time",
        "n_trips"
      ),
      value = list(0)
    )
  }
  if (any(events$state == "stopped")) {
    events[, n_stops := max(stop_id, na.rm = TRUE)]
    
  } else {
    set(events,
        j = c("n_stops"),
        value = list(0))
  }
  
  complete_days[person_day == this_person_day,
                c(
                  "total_travel_dist",
                  "avg_dist_per_trip",
                  "total_trip_time",
                  "avg_trip_time",
                  "n_trips",
                  "n_stops",
                  "rog"
                ) :=
                  c(unique(events[, .(
                    total_travel_dist,
                    avg_dist_per_trip,
                    total_trip_time,
                    avg_trip_time,
                    n_trips,
                    n_stops
                  )]),
                  radiusOfGyrationDT(latitude, longitude, timestamp))]
}

tabdat_day <-
  unique(complete_days[, .(
    day_of_week,
    n_trips,
    n_stops,
    total_travel_dist,
    avg_dist_per_trip,
    total_trip_time,
    avg_trip_time,
    rog
  ), person_day])

Considering only complete days, we end up with 320 days represented by 107 respondents. However, some users contribute only 1 day while others contribute as many as 24. How do our estimates change?
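
These counts can be verified directly from complete_days; a sketch (values should match the 320 days and 107 respondents quoted above):

complete_days[, uniqueN(person_day)]                              # complete person-days
complete_days[, uniqueN(device_id)]                               # respondents represented
complete_days[, uniqueN(person_day), device_id][, summary(V1)]    # days contributed per respondent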

Means

| Day | Total Distance | Mean Distance/Trip | Total Travel Time | Mean Time/Trip | Trips | Stops | Radius of Gyration |
|-----|----------------|--------------------|-------------------|----------------|-------|-------|--------------------|
| Sun | 22.3 km | 7.1 km | 25 min | 8 min | 2.1 | 3.6 | 3.8 km |
| Mon | 39.5 km | 9.6 km | 49 min | 13 min | 3.6 | 5.5 | 5.7 km |
| Tue | 36.3 km | 12.6 km | 44 min | 14 min | 3.3 | 5.0 | 7.4 km |
| Wed | 47.5 km | 12.0 km | 49 min | 13 min | 3.6 | 5.5 | 6.6 km |
| Thu | 47.3 km | 12.2 km | 57 min | 14 min | 3.7 | 5.7 | 7.4 km |
| Fri | 54.5 km | 13.0 km | 63 min | 14 min | 4.2 | 6.2 | 8.2 km |
| Sat | 42.9 km | 9.3 km | 53 min | 11 min | 4.2 | 6.1 | 5.3 km |

Standard deviations

| Day | Total Distance | Mean Distance/Trip | Total Travel Time | Mean Time/Trip | Trips | Stops | Radius of Gyration |
|-----|----------------|--------------------|-------------------|----------------|-------|-------|--------------------|
| Sun | 36.4 km | 12.2 km | 30 min | 9 min | 1.9 | 2.6 | 9.6 km |
| Mon | 53.9 km | 11.3 km | 44 min | 11 min | 2.5 | 3.0 | 9.0 km |
| Tue | 58.8 km | 24.9 km | 43 min | 17 min | 2.3 | 2.7 | 17.3 km |
| Wed | 70.8 km | 17.2 km | 48 min | 12 min | 2.8 | 3.3 | 11.1 km |
| Thu | 60.2 km | 14.4 km | 49 min | 11 min | 2.7 | 3.3 | 9.3 km |
| Fri | 78.6 km | 19.5 km | 66 min | 13 min | 3.4 | 4.0 | 14.1 km |
| Sat | 46.8 km | 10.8 km | 43 min | 10 min | 3.0 | 3.6 | 7.4 km |

The increase in the number of respondents relative to the listwise-deletion approach also allows us to begin to break out measurements by personal characteristics, to weight according to the sampling procedure, and to correct for non-response. Let us consider only total travel time and the variables age, urbanicity and gender. If we keep only categories consisting of more than four persons, we have insufficient data to allow for interactions between these variables.

Cell counts

# These packages may already be loaded via utils.R; listed here for clarity
library(haven)      # zap_label(), as_factor()
library(magrittr)   # %>%
library(knitr)      # kable()
library(kableExtra) # kable_styling()

spss <- getSpssData()
Age <- zap_label(spss$LFT)
spss <- as_factor(spss)
setDT(spss)
complete_days[, username2 := as.numeric(username)]
spss[, spssusername := as.numeric(username)]
spss[, LFT := Age]
complete_days[spss, on = .(username2 == spssusername), `:=`(Sex = Geslacht,
                                        Age = LFT,
                                        Urbanicity = STEDBUURT)]

splitdat <- complete_days[, .(`Trip Time` = unique(total_trip_time)), .(person_day, Age, Sex, Urbanicity, os, device_id)]
splitdat[, Age := as.numeric(Age)]
splitdat[, Sex := factor(Sex, levels = c(1, 2), labels = c("Male", "Female"))]
splitdat[, Urban := factor(Urbanicity, levels = levels(splitdat$Urbanicity), labels = c("Very high", "High", "Moderate", "Slight", "Rural", "Unknown"))]
splitdat[, .(mean = mean(`Trip Time`), min = min(`Trip Time`), max = max(`Trip Time`), sd = sd(`Trip Time`), med = median(`Trip Time`), N = uniqueN(device_id)), .(Urban, Age = cut(Age, breaks = c(15, 18, 35, 55, 77)), Sex)][order(Urban, Age, Sex)][N > 4] %>% 
  kable(digits = 0, caption = "Total travel time split by characteristics") %>% 
  kable_styling(full_width = FALSE)

Total travel time split by characteristics

| Urban | Age | Sex | mean | min | max | sd | med | N |
|-------|-----|-----|------|-----|-----|----|----|---|
| Very high | (18,35] | Female | 55 | 0 | 135 | 33 | 58 | 7 |
| High | (35,55] | Female | 48 | 0 | 111 | 39 | 36 | 6 |
| High | (55,77] | Male | 38 | 0 | 93 | 33 | 33 | 5 |
| Slight | (35,55] | Male | 66 | 9 | 156 | 45 | 67 | 5 |
| Slight | (35,55] | Female | 71 | 0 | 124 | 34 | 76 | 5 |
| Slight | (55,77] | Male | 39 | 0 | 98 | 36 | 36 | 5 |
| Rural | (55,77] | Male | 62 | 0 | 223 | 73 | 22 | 6 |

We are only able to look at the main effects of these categories. The table above might be indicative of differential effects, but with so few people in each group, detecting them will be difficult.

splitdat[, .(mean = mean(`Trip Time`), min = min(`Trip Time`), max = max(`Trip Time`), sd = sd(`Trip Time`), med = median(`Trip Time`), N = uniqueN(device_id)), .(Urban)][order(Urban)][N > 4] %>% 
  kable(digits = 0, caption = "By urbanicity") %>% 
  kable_styling(full_width = FALSE) -> urbtab
splitdat[, .(mean = mean(`Trip Time`), min = min(`Trip Time`), max = max(`Trip Time`), sd = sd(`Trip Time`), med = median(`Trip Time`), N = uniqueN(device_id)), .(Age = cut(Age, breaks = c(15, 18, 35, 55, 77)))][order(Age)][N > 4] %>% 
  kable(digits = 0, caption = "By age") %>% 
  kable_styling(full_width = FALSE)  -> agetab
splitdat[, .(mean = mean(`Trip Time`), min = min(`Trip Time`), max = max(`Trip Time`), sd = sd(`Trip Time`), med = median(`Trip Time`), N = uniqueN(device_id)), .(Sex)][order(Sex)][N > 4] %>% 
  kable(digits = 0, caption = "By sex") %>% 
  kable_styling(full_width = FALSE)  -> sextab

cat(c('<table><tr valign="top"><td>', urbtab, '</td><td>', agetab, '</td><td>', sextab, '</td><tr></table>'),
    sep = '')

By urbanicity

| Urban | mean | min | max | sd | med | N |
|-------|------|-----|-----|----|----|---|
| Very high | 43 | 0 | 172 | 34 | 36 | 20 |
| High | 37 | 0 | 124 | 35 | 31 | 25 |
| Moderate | 55 | 0 | 267 | 55 | 51 | 23 |
| Slight | 55 | 0 | 324 | 52 | 49 | 24 |
| Rural | 49 | 0 | 223 | 55 | 30 | 15 |

By age

| Age | mean | min | max | sd | med | N |
|-----|------|-----|-----|----|----|---|
| (18,35] | 54 | 0 | 324 | 60 | 39 | 34 |
| (35,55] | 55 | 0 | 172 | 39 | 52 | 36 |
| (55,77] | 40 | 0 | 223 | 43 | 27 | 33 |

By sex

| Sex | mean | min | max | sd | med | N |
|-----|------|-----|-----|----|----|---|
| Male | 52 | 0 | 324 | 57 | 36 | 55 |
| Female | 46 | 0 | 231 | 40 | 43 | 52 |

If we consider the number of people who sent us at least some location data, our N per category grows considerably. We can consider this the maximum possible N achievable before accounting for non-response.

data[, username := as.numeric(username)]
data[spss, on = .(username == spssusername), `:=`(Sex = Geslacht, Age = LFT, Urbanicity = STEDBUURT)]
data[, Age := as.numeric(Age)]
data[, AgeC := cut(Age, breaks = c(14, 18, 35, 55, 91))]
# data[, Sex := factor(Sex, levels = c(1, 2), labels = c("Male", "Female"))]
data[, Urban := factor(Urbanicity, levels = levels(splitdat$Urbanicity), labels = c("Very high", "High", "Moderate", "Slight", "Rural", "Unknown"))]
data[, .(`Cell count` = uniqueN(username)), .(Urban, AgeC, Sex)][order(Urban, AgeC, Sex)] %>% kable() %>% kable_styling(full_width = F)

| Urban | AgeC | Sex (1 = Male, 2 = Female) | Cell count |
|-------|------|----------------------------|------------|
| Very high | (14,18] | 1 | 4 |
| Very high | (14,18] | 2 | 2 |
| Very high | (18,35] | 1 | 28 |
| Very high | (18,35] | 2 | 30 |
| Very high | (35,55] | 1 | 20 |
| Very high | (35,55] | 2 | 17 |
| Very high | (55,91] | 1 | 13 |
| Very high | (55,91] | 2 | 13 |
| High | (14,18] | 1 | 5 |
| High | (14,18] | 2 | 5 |
| High | (18,35] | 1 | 19 |
| High | (18,35] | 2 | 21 |
| High | (35,55] | 1 | 20 |
| High | (35,55] | 2 | 33 |
| High | (55,91] | 1 | 24 |
| High | (55,91] | 2 | 17 |
| Moderate | (14,18] | 1 | 3 |
| Moderate | (14,18] | 2 | 7 |
| Moderate | (18,35] | 1 | 13 |
| Moderate | (18,35] | 2 | 16 |
| Moderate | (35,55] | 1 | 20 |
| Moderate | (35,55] | 2 | 21 |
| Moderate | (55,91] | 1 | 17 |
| Moderate | (55,91] | 2 | 17 |
| Slight | (14,18] | 1 | 2 |
| Slight | (14,18] | 2 | 5 |
| Slight | (18,35] | 1 | 12 |
| Slight | (18,35] | 2 | 13 |
| Slight | (35,55] | 1 | 22 |
| Slight | (35,55] | 2 | 20 |
| Slight | (55,91] | 1 | 16 |
| Slight | (55,91] | 2 | 21 |
| Rural | (14,18] | 1 | 5 |
| Rural | (14,18] | 2 | 4 |
| Rural | (18,35] | 1 | 5 |
| Rural | (18,35] | 2 | 12 |
| Rural | (35,55] | 1 | 13 |
| Rural | (35,55] | 2 | 17 |
| Rural | (55,91] | 1 | 18 |
| Rural | (55,91] | 2 | 6 |

DTWBI

McCool et al. (2022) describe a method for imputing aggregate travel characteristics that preserves the maximum amount of data. It is a two-step process in which small gaps are first interpolated, and long gaps are then imputed on the basis of how similar the surrounding time series of travel behavior is to other, complete series, in a process called Dynamic Time Warping-Based Imputation (DTWBI). In our data, the majority (75%) of gaps are under 30 minutes in length.
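
The share of short gaps can be computed from the gap intervals identified earlier; a sketch using the gap and ttn columns defined above:

data[gap == TRUE & !is.na(ttn), mean(ttn < minutes(30))]   # proportion of gaps shorter than 30 minutes (~0.75 per the text)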

Linear interpolation

data2 <- interpolatePoints(data, min_time = minutes(3), max_time = minutes(30), min_dist = -1)

data2[, newttn := shift(timestamp, n = -1) - timestamp, person_day]
data2[, gap := TRUE]
data2[newttn < minutes(5), gap := FALSE]

sg_complete <-
  copy(data2[gap == FALSE, per_day_coverage := sum(newttn), person_day][per_day_coverage > minutes(1368)])
sg_complete_pd <- sg_complete[, unique(person_day)]

# Same day-level processing as above, now on the interpolated data
for (i in seq_along(sg_complete_pd)) {
  this_person_day <- sg_complete_pd[i]
  res <-
    stopFinder(copy(sg_complete[person_day == this_person_day]),
               thetaD = 200,
               thetaT = minutes(5))
  
  mergingCycle(
    res,
    thetaD = 200,
    small_track_action = "exclude",
    max_time = minutes(5),
    max_locs = Inf,
    max_dist = 200
  )
  
  events <- returnStateEvents(res)
  
  if (any(events$state == "moving")) {
    segs <- getSegments(tdtr(
      res[state == "moving"],
      col_names = list(
        entity_id_col = "device_id",
        timestamp_col = "timestamp",
        latitude_col
        = "latitude",
        longitude_col = "longitude"
      ),
      group_col = "move_id",
      max_error = 30
    ),
    group = TRUE)
    
    set(
      events,
      j = c(
        "total_travel_dist",
        "total_trip_time",
        "avg_trip_time",
        "n_trips"
      ),
      value = list(
        sum(segs[["segdist"]]),
        events[state == "moving", sum(time_length(as.duration(end_time - begin_time), "minutes"))],
        events[state == "moving", mean(time_length(as.duration(end_time - begin_time), "minutes"))],
        max(events[["move_id"]], na.rm = TRUE)
      )
    )
    events[, avg_dist_per_trip := total_travel_dist / n_trips]
    
    # events[, total_travel_dist := sum(segs$segdist)]
    # events[, n_trips := max(move_id, na.rm = TRUE)]
    # events[, total_trip_time := sum(end_time[state == "moving"] - begin_time[state == "moving"])]
    # events[, avg_trip_time := mean(end_time[state == "moving"] - begin_time[state == "moving"])]
    
  } else {
    set(
      events,
      j = c(
        "total_travel_dist",
        "avg_dist_per_trip",
        "total_trip_time",
        "avg_trip_time",
        "n_trips"
      ),
      value = list(0)
    )
  }
  if (any(events$state == "stopped")) {
    events[, n_stops := max(stop_id, na.rm = TRUE)]
    
  } else {
    set(events,
        j = c("n_stops"),
        value = list(0))
  }
  
  sg_complete[person_day == this_person_day,
                c(
                  "total_travel_dist",
                  "avg_dist_per_trip",
                  "total_trip_time",
                  "avg_trip_time",
                  "n_trips",
                  "n_stops",
                  "rog"
                ) :=
                  c(unique(events[, .(
                    total_travel_dist,
                    avg_dist_per_trip,
                    total_trip_time,
                    avg_trip_time,
                    n_trips,
                    n_stops
                  )]),
                  radiusOfGyrationDT(latitude, longitude, timestamp))]
}
sg_complete[, day_of_week := na.exclude(day_of_week)[1], person_day]

tabdat_sg_day <-
  unique(sg_complete[, .(
    day_of_week,
    n_trips,
    n_stops,
    total_travel_dist,
    avg_dist_per_trip,
    total_trip_time,
    avg_trip_time,
    rog
  ), person_day])

Means

| Day | Total Distance | Mean Distance/Trip | Total Travel Time | Mean Time/Trip | Trips | Stops | Radius of Gyration |
|-----|----------------|--------------------|-------------------|----------------|-------|-------|--------------------|
| Sun | 27.8 km | 8.6 km | 34 min | 11 min | 2.6 | 4.2 | 4.4 km |
| Mon | 48.6 km | 11.3 km | 58 min | 14 min | 4.0 | 6.0 | 6.5 km |
| Tue | 44.6 km | 12.8 km | 52 min | 14 min | 3.6 | 5.1 | 7.3 km |
| Wed | 53.6 km | 14.1 km | 52 min | 13 min | 3.6 | 5.5 | 7.2 km |
| Thu | 55.7 km | 12.8 km | 66 min | 14 min | 4.0 | 5.8 | 7.5 km |
| Fri | 50.8 km | 10.8 km | 60 min | 13 min | 4.5 | 6.5 | 7.0 km |
| Sat | 49.0 km | 11.4 km | 56 min | 13 min | 4.3 | 6.2 | 6.1 km |

Standard deviations

| Day | Total Distance | Mean Distance/Trip | Total Travel Time | Mean Time/Trip | Trips | Stops | Radius of Gyration |
|-----|----------------|--------------------|-------------------|----------------|-------|-------|--------------------|
| Sun | 39.6 km | 12.7 km | 34 min | 12 min | 2.3 | 2.9 | 9.8 km |
| Mon | 81.1 km | 21.0 km | 63 min | 11 min | 3.3 | 3.9 | 12.0 km |
| Tue | 56.8 km | 20.9 km | 49 min | 15 min | 2.5 | 2.9 | 13.7 km |
| Wed | 87.5 km | 27.6 km | 51 min | 13 min | 2.8 | 3.2 | 13.9 km |
| Thu | 82.6 km | 18.5 km | 75 min | 13 min | 3.9 | 4.1 | 10.6 km |
| Fri | 71.7 km | 16.5 km | 58 min | 12 min | 3.7 | 4.4 | 11.8 km |
| Sat | 61.0 km | 17.4 km | 50 min | 12 min | 3.1 | 3.7 | 10.7 km |

Using only interpolation, we are able to use 604 days across 160 persons. We are interested in what can be gained from also filling in the long gaps. DTWBI looks at the pattern of movement behavior over time, aggregating it into time series with the goal of finding similar patterns. When a respondent's day contains a long gap, we need a buffer of sufficient length on either side of the missing data in order to generate informative data against which to compare. The length of this buffer can vary: the shorter the buffer, the less specific we can be when selecting imputation candidates. For our purposes, a buffer of at least one hour is necessary.
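
A sketch verifying the counts quoted above from sg_complete:

sg_complete[, uniqueN(person_day)]   # usable person-days after interpolation (604 per the text)
sg_complete[, uniqueN(device_id)]    # respondents represented (160 per the text)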

Problems

Because respondents may have multiple gaps within a 24-hour period and because some gaps may occur at the beginning or end of the period, not all days can be imputed with DTWBI. We require that the total sum of covered time before and after the gap is at least 110 minutes. To reduce our workload, if we can calculate that a day will not reach at least 95% coverage (1368 minutes), we will not impute that day. It might be possible in the future to include these days so that they can serve as references for imputation, even if they lack sufficient information to be imputed themselves.

data2[gap == FALSE, new_per_day_coverage := sum(newttn), person_day]
data2[, new_complete := FALSE]
data2[new_per_day_coverage >= minutes(1368), new_complete := TRUE]
data2[new_complete == FALSE, candtw := bufferTest(.SD, hours(1), minutes(1368)), person_day]

Requiring a buffer on both sides for DTWBI provides us with only 187 new days. This is suboptimal, and it occurs because we have prematurely divided our dataset into days. If instead we maintained the data as uninterrupted streams until after the imputation, we could often recover the data on both sides of, for example, nighttime missingness. We have two options in this case: 1. adapt DTWBI for beginning/end missingness, incorporating a match buffer on only one side, or 2. perform DTWBI before the data are split into days, then split after imputation.

# data2[, lastttn := (hms::as_hms("04:00:00") - hms::as_hms(timestamp[.N])), person_day]
# data2[lastttn < 0, lastttn := lastttn + as.duration(hours(24))]
# data2[is.na(newttn), newttn := lastttn, person_day]

A problem we run into with solution 2 is that many of the long gaps (which could otherwise be handled with DTWBI) are interspersed with single measurements, which keeps us from having the full length of matching buffer. Essentially, those extra pieces of data keep the model from working. The figure below illustrates this issue: the black line shows the gap time, and the color on either side of it shows the one-hour match buffer. The best solution here is likely to combine the intersecting gaps into a single gap, recording the distance covered (if any) by the single points flanked by long gaps, and then handling those as necessary.

data3 <- copy(data2[, .(device_id, latitude, longitude, accuracy, timestamp, timestamp_date, adj_date, day_of_week, person_day)])
data3[, tfp := timestamp - shift(timestamp, n = 1), device_id]
# data3[, tfp := shift(ttn)]
data3[, newid := rleid(device_id, !is.na(tfp) & ifelse(tfp > hours(12), tfp, 0))]
data3[, person_block := paste(device_id, newid, sep = "_")]

data3[, ttn := shift(timestamp,n = -1) - timestamp, person_block]
data3[, gap := TRUE]
data3[ttn < minutes(5), gap := FALSE]

test <- data3[person_block == "100_1"]
gaps <- test[gap == TRUE, .(person_block, gapid = .I, gaptime = timestamp, ttn, start = timestamp - hours(1), end = timestamp + ttn + hours(1))]
ggplot(gaps, aes(x = gaptime, y = gapid)) +
  geom_linerange(aes(xmin = start, xmax = end, color = factor(gapid))) +
  geom_linerange(aes(xmin = gaptime, xmax = gaptime + ttn), color = "black", size = 1) + 
  scale_x_datetime("Time")+
  scale_y_continuous("Gap number")+
  scale_color_discrete("Gap number")+
  theme_minimal() +
  theme(legend.position = "none")

gaps <- data3[gap == TRUE, .(gapid = 1:.N, gaptime = timestamp, start = timestamp - hours(1), end = timestamp + ttn + hours(1)), person_block]
gaps[, newgap := mergeGap(end, start), person_block]
gaps[, `:=`(newstart = min(start), newend = max(end, na.rm = TRUE)), .(person_block, newgap)]
gaps[start == newstart, forebuffer_end := gaptime]
gaps[end == newend, endbuffer_start := gaptime]
gaps[, `:=`(endbuffer_start = na.exclude(endbuffer_start)[1], forebuffer_end = na.exclude(forebuffer_end)[1]), .(person_block, newgap)]
data3[gaps,
     on = .(person_block, timestamp >= newstart, timestamp <= newend),
     `:=`(newgap = i.newgap, newstart = i.newstart, newend = i.newend, forebuffer_end = i.forebuffer_end, endbuffer_start = i.endbuffer_start)]

data3[, extra_point := FALSE]
data3[!is.na(newgap) & gap == FALSE, extra_point := timestamp %between% c(forebuffer_end[1], endbuffer_start[1]), .(person_block, newgap)]
data3[!is.na(newgap), candtw := testBufferSingleGap(.SD), .(person_block, newgap)]
data3[!is.na(newgap), atleast24h := testBufferAfterMerge(.SD, min.day.minutes = minutes(1368)), .(person_block)]
data3[gap == TRUE | extra_point == TRUE, interdist := sum(geodist_vec(longitude, latitude, sequential = TRUE, measure = "haversine")), .(person_block, newgap)]
data3[!is.na(newgap), interdist := na.exclude(interdist)[1], .(person_block, newgap)]
data3[, newgaplength := (endbuffer_start[1] - forebuffer_end[1]), .(person_block, newgap)]
data3[newgaplength > hours(12), candtw := FALSE]
data3[, aftermergeid := paste(person_block, rleid(newgap), sep = "_"), .(person_block)]

If we merge the long gaps, we end up with 2354 long gaps that can be imputed. DTWBI requires that we create a uniform discrete time series, aggregating the travel behavior within each interval. We have chosen 15 minutes, striking a balance between periods short enough to represent differential travel behavior and long enough to ensure that they encompass multiple location measurements when the minimum gap is set to 5 minutes in length.
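
As a small illustration of the 15-minute chunk indexing used below (assuming lubridate period arithmetic), a point 47 minutes into a block falls into chunk 3, i.e. the fourth 15-minute interval:

floor(as.period(minutes(47)) / minutes(15))   # 3, matching the ftchunk calculation below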

setorder(data3, device_id, timestamp)
data3[, hasmissings := any(gap == TRUE), aftermergeid]
full <- copy(data3[hasmissings == FALSE | candtw == TRUE]) # we lose about 1 million records and 4,000 person_blocks that do not contain enough information
full[, time_elapsed := timestamp - timestamp[1], aftermergeid]
full[, ftchunk := floor(as.period(time_elapsed)/minutes(15))]
full[, fifteens := sprintf("%04d", ftchunk)]
full[, grouping := paste(aftermergeid, fifteens, sep = "-")]

setorder(full, aftermergeid, ftchunk)
started.at=proc.time()
res <- tdtr(full, col_names = list(entity_id_col = "aftermergeid", timestamp_col = "timestamp", latitude_col
                                   = "latitude", longitude_col = "longitude"), group_col = "grouping", max_error = 30)
cat("Finished in",timetaken(started.at),"\n") ## 13 minutes
saveRDS(res, "./data/tdtrdata_221810.RDS")
segs <- getSegments(res, group = TRUE)
saveRDS(segs, "./data/segs_221810.RDS")

We add in the missing time series entries, setting the distance to NA.

segs <- readRDS("./data/segs_221810.RDS")
setDT(segs)
databyfifteen <- segs[, .(tsstart = min(seg_start_time),
                          tsend = max(seg_end_time),
                          dist = sum(segdist),
                          tsstartlat = seg_start_lat[1],
                          tsendlat = seg_end_lat[.N],
                          tsstartlon = seg_start_lon[1],
                          tsendlon = seg_end_lon[.N]),
                      .(entity_id, .id)]
databyfifteen[, c("person_block", "fifteens") := tstrsplit(.id, split = "-")]
databyfifteen[, ftchunk := as.numeric(fifteens)]
databyfifteen[, tsstart := as_datetime(tsstart)]
databyfifteen[, tsend := as_datetime(tsend)]
databyfifteen[, first_timestamp := (min(tsstart)), entity_id]

missingfifteens <- databyfifteen[, seq(from = 0, to = max(ftchunk))[!seq(from = 0, to = max(ftchunk)) %in% ftchunk], entity_id]
missingfifteens[, `:=`(lastbefore = min(V1) - 1,
                       firstafter = max(V1) + 1), entity_id]
newrows <- databyfifteen[missingfifteens, on = .(entity_id, ftchunk == lastbefore),
                         .(entity_id,
                           .id = paste(entity_id, sprintf("%04d", i.V1), sep = "-"),
                           tsstart = first_timestamp + minutes(i.V1 * 15) ,
                           tsend = first_timestamp + minutes((i.V1 + 1) * 15),
                           dist = NA,
                           tsstartlat = x.tsendlat,
                           tsendlat = NA_real_,
                           tsstartlon = x.tsendlon,
                           tsendlon = NA_real_,
                           person_block,
                           fifteens = sprintf("%04d", i.V1),
                           ftchunk = i.V1,
                           first_timestamp,
                           firstafter)]
newrows[databyfifteen, on = .(entity_id, firstafter == ftchunk), `:=`(tsendlat = i.tsstartlat, tsendlon = i.tsstartlon)]
set(newrows, j = "firstafter", value = NULL)

databyfifteen <- rbindlist(list(
  databyfifteen, newrows
))

setorder(databyfifteen, entity_id, ftchunk)
databyfifteen[, first_timestamp := NULL]
databyfifteen[, time := hms::as_hms(tsstart)]
databyfifteen[, maxftchunk := max(ftchunk), entity_id]

This gives us 4928 complete or imputable slices of varying length. The next step is imputation using DTWBI with the Burst parameters. First we separate out the 'queries', which are the series containing long-gap missingness, and we order them by the amount of missingness.

n_imps <- 10
prob.power <- 4
window.size <- hours(3)
match.window <- 4

impdata <-
    copy(rbindlist(replicate(n_imps, databyfifteen, simplify = FALSE), idcol = "imp"))
impdata[, orig_dist := dist]

queries <- impdata[imp == 1, .SD[any(is.na(dist))], entity_id]
query_order <- queries[, sum(is.na(dist)), entity_id][order(V1, decreasing = FALSE), entity_id]

library(doParallel)
detectCores()
## [1] 16
registerDoParallel(10)

allimps <- foreach(
    this_imp = seq_len(n_imps),
    .combine = rbind,
    .inorder = FALSE,
    .multicombine = TRUE
  ) %dopar% {
    for (rechain_nr in seq_len(2)) {
      cat("Chain number", rechain_nr, "\n", file = "")
      # for (i in seq_along(query_order)) {
      for (i in 2) {
        refs <-
          impdata[imp == this_imp &
                    entity_id != query_order[i], # Select only those for this imputation chain
                  .SD[!any(is.na(dist))],                   # That are complete (includes previously imputed)
                  entity_id]                                     # Grouped by entity_id so we get only those with no NAs when we use any()
        this_query <-
          queries[entity_id == query_order[i]] # Select the query for innermost loop
        miss       <-
          this_query[, is.na(dist)]       # Extract the vector of missingness
        obs        <-
          !miss                           # The inverse observed vector

        # query <- this_query
        impdata <- imputeMidmissing(
          impdata  = impdata,
          this_imp = this_imp,
          query    = this_query,
          obs      = obs,
          refs     = refs,
          n_imps   = 1,
          prob.power   = prob.power,
          window       = window.size,
          match.window = match.window
        )

      } # End query iteration
    }   # End chaining iteration
    impdata[imp == this_imp]
  }     # End %dopar% over n_imps



allimps[, `:=`(
  window.size = window.size,
  prob.power = prob.power,
  match.window = match.window
)]
  
allimps[is.na(orig_dist) & !is.na(dist)]

| imp | entity_id | .id | tsstart | tsend | dist |
|-----|-----------|-----|---------|-------|------|
| 1 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 0.000000 |
| 2 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 3.696013 |
| 3 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 154.512946 |
| 4 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 8.658444 |
| 5 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 0.939696 |
| 6 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 128.711587 |
| 7 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 2.530084 |
| 8 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 9.753136 |
| 9 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 360.101091 |
| 10 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 703.494775 |
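
The ten imputed values per originally-missing slice can then be summarised to inspect between-imputation variability; a sketch (not part of the original pipeline):

allimps[is.na(orig_dist) & !is.na(dist),
        .(mean_dist = mean(dist), sd_dist = sd(dist), n_imps = .N),
        .(entity_id, .id)]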