library(forcats)     # Factor reordering
library(ggplot2)     # Plots
library(data.table)  # Fast data manipulation
library(lubridate)   # Date-time handling
library(magrittr)    # %>% pipe used with kable() below
library(knitr)       # kable()
library(kableExtra)  # kable_styling()
library(haven)       # zap_label(), as_factor() for the SPSS data
library(geodist)
library(stopdetection)
library(topdowntimeratio)
source("utils.R")
source("dtwbi_functions.R")
knitr::opts_chunk$set(cache = FALSE, warning = FALSE,
message = FALSE, cache.lazy = FALSE)
Raw location data collected from respondents were used. All entries with a reported accuracy of 200 meters or more were removed.
# library(stopdetection)
source("~/cbs_share/projecten/BPM/301707WaarnVnw_SEC1/Werk/Tabi Verplaatsingen App/R code Danielle McCool/New code suite/file_reading_functions.R")
source("~/cbs_share/projecten/BPM/301707WaarnVnw_SEC1/Werk/Tabi Verplaatsingen App/R code Danielle McCool/New code suite/file_merging_functions.R")
source("~/cbs_share/projecten/BPM/301707WaarnVnw_SEC1/Werk/Tabi Verplaatsingen App/R code Danielle McCool/New code suite/file_cleaning_functions.R")
# Set up reference data ---------------------------------------------------
data <- readInData(getNewFilenames(as.if.day = "2018-12-15"))
# Remove unnecessary columns
data[, created_at := NULL]
data[, id := NULL]
data[, distance_between_previous_position := NULL]
# Remove duplicates
data <- copy(unique(data, by = c("device_id", "latitude", "longitude", "timestamp")))
# Remove data from cell towers
data <- data[accuracy < 200]
# We have to do the ordering a number of times
setorder(data, device_id, timestamp)
# Standard data-loading steps that have been used for the past two years
devices <- readInData(getNewFilenames(type = "device", as.if.day = "2018-12-16"))
users <- readInData(getNewFilenames(type = "user", as.if.day = "2018-12-16"))
data <- mergeDataDevices(data, devices)
data <- mergeDataUsers(data, users)
data <- keepParticipants(data)
data <- data[!is.na(device_id)]
data <- data[!is.na(timestamp)]
data[, c("altitude", "speed", "desired_accuracy", "heading") := list(NULL)]
## Timestamp duplications
data[!duplicated(timestamp), dupid := 0]
data[duplicated(timestamp), dupid := 1:.N, .(device_id, timestamp)]
data[, timestamp2 := timestamp + microseconds(dupid)]
data[, timestamp := timestamp2]
setorder(data, device_id, timestamp)
To facilitate comparison between methods, movement behavior was subdivided into 24-hour periods starting at 4 a.m., so that late-night travel occurring after midnight is attributed to the preceding day.
# Correct timezone display
data[, timestamp := with_tz(timestamp, tzone = "Europe/Amsterdam")]
data[, timestamp_date := floor_date(timestamp, "day")]
# Timestamps after 04:00 should be the timestamp date
# Timestamps between midnight and 03:59:59.999 should be the timestamp date minus one
data[, adj_date := floor_date(timestamp - hours(4), "day")]
data[, day_of_week := wday(adj_date, label = TRUE)]
data[, day_of_week := ordered(day_of_week)]
data[, person_day := paste(device_id, format(adj_date, ("%d-%m")), sep = "_")]
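As a quick illustration (not part of the pipeline), the 4 a.m. shift assigns an early-morning timestamp to the preceding day:
example_ts <- as.POSIXct(c("2018-11-06 02:30:00", "2018-11-06 09:00:00"), tz = "Europe/Amsterdam")
floor_date(example_ts - hours(4), "day")
# "2018-11-05 CET" "2018-11-06 CET"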
We select a criterion for how much time must elapse before we consider someone to have missing data; here we opt for a maximum gap of 5 minutes. Because people often travel similarly from week to week, but may differ moderately between days of the week, we are interested in recording movement for each day of the week for each person. The figure below shows, for each participant, the number of recorded hours on the best-covered instance of each weekday.
setorder(data, device_id, timestamp)
data[, ttn := shift(timestamp,n = -1) - timestamp, person_day]
data[, gap := TRUE]
data[ttn < minutes(5), gap := FALSE]
# Take the sum of all intervals shorter than the minimum gap length per day of
# week
coverage <-
copy(data[gap == FALSE,
.(hours = time_length(sum(ttn), "hours")),
.(device_id, person_day, day_of_week)])
# For each day of the week for a person, find the one with the best coverage
cvplot <-
coverage[,
.(max_hrs = round(max(hours), 2)),
.(device_id, day_of_week)][order(device_id, day_of_week)]
# Find the week total
cvplot[, sumhrs := sum(max_hrs), device_id]
# Order the device_ids by the week total
cvplot[, device_id_ordered := fct_reorder(factor(device_id), sumhrs)]
# Tile map of participant weeks
ggplot(cvplot, aes(y = day_of_week, x = device_id_ordered, fill = max_hrs)) +
geom_tile() +
viridis::scale_fill_viridis(option = "A") +
labs(x = "Participant", y = "", fill = "Hours")+
theme(axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
legend.position = "bottom",
legend.direction = "horizontal",
legend.margin = margin(),
legend.key.width = unit(3, "cm"),
legend.key.height = unit(.5, "cm")) +
coord_fixed(ratio = 20)
A participant with seven days of complete data should have a total of 168 hours. In fact, participants had a mean of 59.4 hours and a median of 54.4 hours.
cvplot[, .SD[1], device_id] %>%
ggplot(aes(x = sumhrs)) +
geom_histogram(binwidth = 4) +
scale_x_continuous("Week hour sum") +
theme_minimal()
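The mean and median weekly coverage quoted above can be reproduced directly from cvplot, keeping one row per participant:
cvplot[, .SD[1], device_id][, .(mean_hours = mean(sumhrs), median_hours = median(sumhrs))]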
# Results using other methods
The most typical method of addressing missingness is listwise deletion, in which only data with no gaps are used. When the missingness mechanism is missing completely at random (MCAR; the probability of being missing is the same for all cases), this produces unbiased results, although it is less efficient, with larger standard errors than if all available data were used. Let us say that at least 95% of the total week must be covered, giving a cutoff of approximately 160 hours. This leaves us with 12 persons out of our 604 respondents. We may then perform our analyses on the basis of only these data.
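The cutoff follows from the 95% criterion; a quick check of the arithmetic and of the number of respondents who meet it (this count should match the 12 quoted above):
0.95 * 24 * 7 # 159.6 hours, i.e. a cutoff of roughly 160
cvplot[, .SD[1], device_id][sumhrs > 160, .N] # respondents with a near-complete week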
complete_week_ids <- cvplot[sumhrs > 160, unique(device_id)]
complete_week_pd <- coverage[device_id %in% complete_week_ids, person_day[hours == max(hours)], .(device_id, day_of_week)]$V1
complete_week_data <- copy(data[person_day %in% complete_week_pd])
for (i in seq_along(complete_week_pd)) {
this_person_day <- complete_week_pd[i]
res <- stopFinder(copy(complete_week_data[person_day == this_person_day]), thetaD = 200, thetaT = minutes(5))
mergingCycle(res, thetaD = 200, small_track_action = "exclude", max_time = minutes(5), max_locs = Inf, max_dist = 200)
events <- returnStateEvents(res)
if (any(events$state == "moving")) {
segs <- getSegments(tdtr(res[state == "moving"],
col_names = list(entity_id_col = "device_id", timestamp_col = "timestamp",
latitude_col = "latitude", longitude_col = "longitude"),
group_col = "move_id", max_error = 30), group = TRUE)
events[, total_travel_dist := sum(segs$segdist)]
events[, n_trips := max(move_id, na.rm = TRUE)]
events[, avg_dist_per_trip := total_travel_dist / n_trips]
events[state == "moving", total_trip_time := sum(time_length(as.duration(end_time - begin_time), "minutes"))]
events[, total_trip_time := na.exclude(total_trip_time)[1]]
events[state == "moving", avg_trip_time := mean(time_length(as.duration(end_time - begin_time), "minutes"))]
events[, avg_trip_time := na.exclude(avg_trip_time)[1]]
} else {
set(events,
j = c("total_travel_dist", "avg_dist_per_trip", "total_trip_time", "avg_trip_time", "n_trips"),
value = list(0))
}
if (any(events$state == "stopped")) {
events[, n_stops := max(stop_id, na.rm = TRUE)]
} else {
set(events,
j = c("n_stops"),
value = list(0))
}
complete_week_data[person_day == this_person_day,
c("total_travel_dist", "avg_dist_per_trip", "total_trip_time", "avg_trip_time", "n_trips", "n_stops", "rog") :=
c(unique(events[, .(total_travel_dist, avg_dist_per_trip, total_trip_time, avg_trip_time, n_trips, n_stops)]), radiusOfGyrationDT(latitude, longitude, timestamp))]
}
tabdat <-
unique(complete_week_data[, .(
day_of_week,
n_trips,
n_stops,
total_travel_dist,
avg_dist_per_trip,
total_trip_time,
avg_trip_time,
rog
), person_day])
# tabdat[, day_of_week := ordered(day_of_week)]
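The code that rendered the two tables below is not shown; they appear to give per-weekday means followed by the corresponding spread (presumably standard deviations, as discussed after the tables). A sketch of the mean aggregation, with unit conversion (metres to kilometres) and rounding omitted:
tabdat[, lapply(.SD, mean), by = day_of_week,
.SDcols = c("total_travel_dist", "avg_dist_per_trip", "total_trip_time",
"avg_trip_time", "n_trips", "n_stops", "rog")]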
Day | Total Distance | Mean Distance/Trip | Total Travel Time | Mean Time/Trip | Trips | Stops | Radius of Gyration |
---|---|---|---|---|---|---|---|
Sun | 23.1 km | 7.4 km | 28 min | 10 min | 1.5 | 2.6 | 2.7 km |
Mon | 21.6 km | 5.0 km | 34 min | 9 min | 2.9 | 5.1 | 3.6 km |
Tue | 33.0 km | 11.3 km | 42 min | 13 min | 2.9 | 5.2 | 5.5 km |
Wed | 39.8 km | 11.5 km | 46 min | 13 min | 3.1 | 4.6 | 6.0 km |
Thu | 42.5 km | 13.1 km | 56 min | 16 min | 3.9 | 5.8 | 5.9 km |
Fri | 62.5 km | 12.9 km | 66 min | 12 min | 4.1 | 6.3 | 14.1 km |
Sat | 58.9 km | 13.2 km | 67 min | 16 min | 3.2 | 5.2 | 10.8 km |
Day | Total Distance | Mean Distance/Trip | Total Travel Time | Mean Time/Trip | Trips | Stops | Radius of Gyration |
---|---|---|---|---|---|---|---|
Sun | 42.7 km | 13.2 km | 47 min | 20 min | 1.8 | 2.0 | 5.6 km |
Mon | 22.1 km | 4.6 km | 29 min | 6 min | 2.3 | 2.5 | 4.9 km |
Tue | 37.7 km | 17.4 km | 29 min | 11 min | 1.8 | 2.3 | 7.5 km |
Wed | 44.0 km | 13.8 km | 41 min | 12 min | 2.8 | 3.1 | 7.9 km |
Thu | 33.7 km | 14.2 km | 35 min | 11 min | 2.6 | 3.1 | 5.8 km |
Fri | 100.0 km | 18.5 km | 91 min | 13 min | 4.1 | 4.3 | 21.7 km |
Sat | 78.0 km | 13.5 km | 74 min | 14 min | 2.1 | 3.3 | 19.1 km |
We would prefer not to draw conclusions from such a small sample – the standard deviations are in some cases more than twice the estimates themselves. We may also rightfully be concerned that this sample is no longer representative of the population we are interested in, and that we may not be able to successfully weight the available data to compensate for the non-responders. It is therefore worth considering alternatives that allow us to use the data that we do have.
One such alternative would be to consider the complete days only without requiring a complete week. This can be considered a rough analogue to pairwise deletion. Sample persons will not be equally represented in this case, which can be handled in various ways. How do our estimates differ if we consider all available complete days?
complete_days <-
copy(data[gap == FALSE, per_day_coverage := sum(ttn), person_day][per_day_coverage > minutes(1368)])
complete_days_pd <- complete_days[, unique(person_day)]
for (i in seq_along(complete_days_pd)) {
this_person_day <- complete_days_pd[i]
res <-
stopFinder(copy(complete_days[person_day == this_person_day]),
thetaD = 200,
thetaT = minutes(5))
mergingCycle(
res,
thetaD = 200,
small_track_action = "exclude",
max_time = minutes(5),
max_locs = Inf,
max_dist = 200
)
events <- returnStateEvents(res)
if (any(events$state == "moving")) {
segs <- getSegments(tdtr(
res[state == "moving"],
col_names = list(
entity_id_col = "device_id",
timestamp_col = "timestamp",
latitude_col = "latitude",
longitude_col = "longitude"
),
group_col = "move_id",
max_error = 30
),
group = TRUE)
set(
events,
j = c(
"total_travel_dist",
"total_trip_time",
"avg_trip_time",
"n_trips"
),
value = list(
sum(segs[["segdist"]]),
events[state == "moving", sum(time_length(as.duration(end_time - begin_time), "minutes"))],
events[state == "moving", mean(time_length(as.duration(end_time - begin_time), "minutes"))],
max(events[["move_id"]], na.rm = TRUE)
)
)
events[, avg_dist_per_trip := total_travel_dist / n_trips]
# events[, total_travel_dist := sum(segs$segdist)]
# events[, n_trips := max(move_id, na.rm = TRUE)]
# events[, total_trip_time := sum(end_time[state == "moving"] - begin_time[state == "moving"])]
# events[, avg_trip_time := mean(end_time[state == "moving"] - begin_time[state == "moving"])]
} else {
set(
events,
j = c(
"total_travel_dist",
"avg_dist_per_trip",
"total_trip_time",
"avg_trip_time",
"n_trips"
),
value = list(0)
)
}
if (any(events$state == "stopped")) {
events[, n_stops := max(stop_id, na.rm = TRUE)]
} else {
set(events,
j = c("n_stops"),
value = list(0))
}
complete_days[person_day == this_person_day,
c(
"total_travel_dist",
"avg_dist_per_trip",
"total_trip_time",
"avg_trip_time",
"n_trips",
"n_stops",
"rog"
) :=
c(unique(events[, .(
total_travel_dist,
avg_dist_per_trip,
total_trip_time,
avg_trip_time,
n_trips,
n_stops
)]),
radiusOfGyrationDT(latitude, longitude, timestamp))]
}
tabdat_day <-
unique(complete_days[, .(
day_of_week,
n_trips,
n_stops,
total_travel_dist,
avg_dist_per_trip,
total_trip_time,
avg_trip_time,
rog
), person_day])
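The day and respondent counts cited in the next paragraph can be checked directly:
complete_days[, .(days = uniqueN(person_day), respondents = uniqueN(device_id))]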
Considering only complete days, we end up with 320 days represented by 107 respondents. However, some users contribute only 1 day and others contribute 24 days. How do our estimates change?
Day | Total Distance | Mean Distance/Trip | Total Travel Time | Mean Time/Trip | Trips | Stops | Radius of Gyration |
---|---|---|---|---|---|---|---|
Sun | 22.3 km | 7.1 km | 25 min | 8 min | 2.1 | 3.6 | 3.8 km |
Mon | 39.5 km | 9.6 km | 49 min | 13 min | 3.6 | 5.5 | 5.7 km |
Tue | 36.3 km | 12.6 km | 44 min | 14 min | 3.3 | 5.0 | 7.4 km |
Wed | 47.5 km | 12.0 km | 49 min | 13 min | 3.6 | 5.5 | 6.6 km |
Thu | 47.3 km | 12.2 km | 57 min | 14 min | 3.7 | 5.7 | 7.4 km |
Fri | 54.5 km | 13.0 km | 63 min | 14 min | 4.2 | 6.2 | 8.2 km |
Sat | 42.9 km | 9.3 km | 53 min | 11 min | 4.2 | 6.1 | 5.3 km |
Day | Total Distance | Mean Distance/Trip | Total Travel Time | Mean Time/Trip | Trips | Stops | Radius of Gyration |
---|---|---|---|---|---|---|---|
Sun | 36.4 km | 12.2 km | 30 min | 9 min | 1.9 | 2.6 | 9.6 km |
Mon | 53.9 km | 11.3 km | 44 min | 11 min | 2.5 | 3.0 | 9.0 km |
Tue | 58.8 km | 24.9 km | 43 min | 17 min | 2.3 | 2.7 | 17.3 km |
Wed | 70.8 km | 17.2 km | 48 min | 12 min | 2.8 | 3.3 | 11.1 km |
Thu | 60.2 km | 14.4 km | 49 min | 11 min | 2.7 | 3.3 | 9.3 km |
Fri | 78.6 km | 19.5 km | 66 min | 13 min | 3.4 | 4.0 | 14.1 km |
Sat | 46.8 km | 10.8 km | 43 min | 10 min | 3.0 | 3.6 | 7.4 km |
The increase in the number of respondents relative to the previous approach also allows us to begin to break out measurements by person characteristics, to weight according to the sampling procedure, and to correct for non-response. Let us consider only total travel time and the variables age, urbanicity, and gender. If we keep only categories consisting of more than four persons, we have insufficient data to allow for interactions between these variables.
spss <- getSpssData()
Age <- zap_label(spss$LFT)
spss <- as_factor(spss)
setDT(spss)
complete_days[, username2 := as.numeric(username)]
spss[, spssusername := as.numeric(username)]
spss[, LFT := Age]
complete_days[spss, on = .(username2 == spssusername), `:=`(Sex = Geslacht,
Age = LFT,
Urbanicity = STEDBUURT)]
splitdat <- complete_days[, .(`Trip Time` = unique(total_trip_time)), .(person_day, Age, Sex, Urbanicity, os, device_id)]
splitdat[, Age := as.numeric(Age)]
splitdat[, Sex := factor(Sex, levels = c(1, 2), labels = c("Male", "Female"))]
splitdat[, Urban := factor(Urbanicity, levels = levels(splitdat$Urbanicity), labels = c("Very high", "High", "Moderate", "Slight", "Rural", "Unknown"))]
splitdat[, .(mean = mean(`Trip Time`), min = min(`Trip Time`), max = max(`Trip Time`), sd = sd(`Trip Time`), med = median(`Trip Time`), N = uniqueN(device_id)), .(Urban, Age = cut(Age, breaks = c(15, 18, 35, 55, 77)), Sex)][order(Urban, Age, Sex)][N > 4] %>%
kable(digits = 0, caption = "Total travel time split by characteristics") %>%
kable_styling(full_width = FALSE)
Urban | Age | Sex | mean | min | max | sd | med | N |
---|---|---|---|---|---|---|---|---|
Very high | (18,35] | Female | 55 | 0 | 135 | 33 | 58 | 7 |
High | (35,55] | Female | 48 | 0 | 111 | 39 | 36 | 6 |
High | (55,77] | Male | 38 | 0 | 93 | 33 | 33 | 5 |
Slight | (35,55] | Male | 66 | 9 | 156 | 45 | 67 | 5 |
Slight | (35,55] | Female | 71 | 0 | 124 | 34 | 76 | 5 |
Slight | (55,77] | Male | 39 | 0 | 98 | 36 | 36 | 5 |
Rural | (55,77] | Male | 62 | 0 | 223 | 73 | 22 | 6 |
We are only able to look at the main effects of these categories. The table above might be indicative of differential effects, but with so few people in each group, detecting such effects will be difficult.
splitdat[, .(mean = mean(`Trip Time`), min = min(`Trip Time`), max = max(`Trip Time`), sd = sd(`Trip Time`), med = median(`Trip Time`), N = uniqueN(device_id)), .(Urban)][order(Urban)][N > 4] %>%
kable(digits = 0, caption = "By urbanicity") %>%
kable_styling(full_width = FALSE) -> urbtab
splitdat[, .(mean = mean(`Trip Time`), min = min(`Trip Time`), max = max(`Trip Time`), sd = sd(`Trip Time`), med = median(`Trip Time`), N = uniqueN(device_id)), .(Age = cut(Age, breaks = c(15, 18, 35, 55, 77)))][order(Age)][N > 4] %>%
kable(digits = 0, caption = "By age") %>%
kable_styling(full_width = FALSE) -> agetab
splitdat[, .(mean = mean(`Trip Time`), min = min(`Trip Time`), max = max(`Trip Time`), sd = sd(`Trip Time`), med = median(`Trip Time`), N = uniqueN(device_id)), .(Sex)][order(Sex)][N > 4] %>%
kable(digits = 0, caption = "By sex") %>%
kable_styling(full_width = FALSE) -> sextab
cat(c('<table><tr valign="top"><td>', urbtab, '</td><td>', agetab, '</td><td>', sextab, '</td><tr></table>'),
sep = '')
[Three tables shown side by side in the rendered document: total travel time by urbanicity, by age, and by sex.]
If we consider the number of people who sent us at least some location data, our N per category grows considerably. We can consider this the maximum attainable N, before accounting for non-response. (In the table below, sex is coded 1 = male, 2 = female.)
data[, username := as.numeric(username)]
data[spss, on = .(username == spssusername), `:=`(Sex = Geslacht, Age = LFT, Urbanicity = STEDBUURT)]
data[, Age := as.numeric(Age)]
data[, AgeC := cut(Age, breaks = c(14, 18, 35, 55, 91))]
# data[, Sex := factor(Sex, levels = c(1, 2), labels = c("Male", "Female"))]
data[, Urban := factor(Urbanicity, levels = levels(splitdat$Urbanicity), labels = c("Very high", "High", "Moderate", "Slight", "Rural", "Unknown"))]
data[, .(`Cell count` = uniqueN(username)), .(Urban, AgeC, Sex)][order(Urban, AgeC, Sex)] %>% kable() %>% kable_styling(full_width = F)
Urban | AgeC | Sex | Cell count |
---|---|---|---|
Very high | (14,18] | 1 | 4 |
Very high | (14,18] | 2 | 2 |
Very high | (18,35] | 1 | 28 |
Very high | (18,35] | 2 | 30 |
Very high | (35,55] | 1 | 20 |
Very high | (35,55] | 2 | 17 |
Very high | (55,91] | 1 | 13 |
Very high | (55,91] | 2 | 13 |
High | (14,18] | 1 | 5 |
High | (14,18] | 2 | 5 |
High | (18,35] | 1 | 19 |
High | (18,35] | 2 | 21 |
High | (35,55] | 1 | 20 |
High | (35,55] | 2 | 33 |
High | (55,91] | 1 | 24 |
High | (55,91] | 2 | 17 |
Moderate | (14,18] | 1 | 3 |
Moderate | (14,18] | 2 | 7 |
Moderate | (18,35] | 1 | 13 |
Moderate | (18,35] | 2 | 16 |
Moderate | (35,55] | 1 | 20 |
Moderate | (35,55] | 2 | 21 |
Moderate | (55,91] | 1 | 17 |
Moderate | (55,91] | 2 | 17 |
Slight | (14,18] | 1 | 2 |
Slight | (14,18] | 2 | 5 |
Slight | (18,35] | 1 | 12 |
Slight | (18,35] | 2 | 13 |
Slight | (35,55] | 1 | 22 |
Slight | (35,55] | 2 | 20 |
Slight | (55,91] | 1 | 16 |
Slight | (55,91] | 2 | 21 |
Rural | (14,18] | 1 | 5 |
Rural | (14,18] | 2 | 4 |
Rural | (18,35] | 1 | 5 |
Rural | (18,35] | 2 | 12 |
Rural | (35,55] | 1 | 13 |
Rural | (35,55] | 2 | 17 |
Rural | (55,91] | 1 | 18 |
Rural | (55,91] | 2 | 6 |
McCool et al. (2022) describe a method for imputing aggregate travel characteristics that preserves the maximum amount of data. It is a two-step process in which small gaps are first interpolated, and long gaps are then imputed based on the similarity of the travel-behavior time series, a process called Dynamic Time Warping-Based Imputation (DTWBI). In our data, the majority (75%) of gaps are under 30 minutes in length.
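The 75% figure can be checked along these lines, using the ttn and gap columns computed earlier (gaps truncated at the day boundary have a missing ttn and are excluded):
data[gap == TRUE & !is.na(ttn), mean(ttn < minutes(30))]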
data2 <- interpolatePoints(data, min_time = minutes(3), max_time = minutes(30), min_dist = -1)
data2[, newttn := shift(timestamp, n = -1) - timestamp, person_day]
data2[, gap := TRUE]
data2[newttn < minutes(5), gap := FALSE]
sg_complete <-
copy(data2[gap == FALSE, per_day_coverage := sum(newttn), person_day][per_day_coverage > minutes(1368)])
sg_complete_pd <- sg_complete[, unique(person_day)]
for (i in seq_along(sg_complete_pd)) {
this_person_day <- sg_complete_pd[i]
res <-
stopFinder(copy(sg_complete[person_day == this_person_day]),
thetaD = 200,
thetaT = minutes(5))
mergingCycle(
res,
thetaD = 200,
small_track_action = "exclude",
max_time = minutes(5),
max_locs = Inf,
max_dist = 200
)
events <- returnStateEvents(res)
if (any(events$state == "moving")) {
segs <- getSegments(tdtr(
res[state == "moving"],
col_names = list(
entity_id_col = "device_id",
timestamp_col = "timestamp",
latitude_col = "latitude",
longitude_col = "longitude"
),
group_col = "move_id",
max_error = 30
),
group = TRUE)
set(
events,
j = c(
"total_travel_dist",
"total_trip_time",
"avg_trip_time",
"n_trips"
),
value = list(
sum(segs[["segdist"]]),
events[state == "moving", sum(time_length(as.duration(end_time - begin_time), "minutes"))],
events[state == "moving", mean(time_length(as.duration(end_time - begin_time), "minutes"))],
max(events[["move_id"]], na.rm = TRUE)
)
)
events[, avg_dist_per_trip := total_travel_dist / n_trips]
# events[, total_travel_dist := sum(segs$segdist)]
# events[, n_trips := max(move_id, na.rm = TRUE)]
# events[, total_trip_time := sum(end_time[state == "moving"] - begin_time[state == "moving"])]
# events[, avg_trip_time := mean(end_time[state == "moving"] - begin_time[state == "moving"])]
} else {
set(
events,
j = c(
"total_travel_dist",
"avg_dist_per_trip",
"total_trip_time",
"avg_trip_time",
"n_trips"
),
value = list(0)
)
}
if (any(events$state == "stopped")) {
events[, n_stops := max(stop_id, na.rm = TRUE)]
} else {
set(events,
j = c("n_stops"),
value = list(0))
}
sg_complete[person_day == this_person_day,
c(
"total_travel_dist",
"avg_dist_per_trip",
"total_trip_time",
"avg_trip_time",
"n_trips",
"n_stops",
"rog"
) :=
c(unique(events[, .(
total_travel_dist,
avg_dist_per_trip,
total_trip_time,
avg_trip_time,
n_trips,
n_stops
)]),
radiusOfGyrationDT(latitude, longitude, timestamp))]
}
sg_complete[, day_of_week := na.exclude(day_of_week)[1], person_day]
tabdat_sg_day <-
unique(sg_complete[, .(
day_of_week,
n_trips,
n_stops,
total_travel_dist,
avg_dist_per_trip,
total_trip_time,
avg_trip_time,
rog
), person_day])
Day | Total Distance | Mean Distance/Trip | Total Travel Time | Mean Time/Trip | Trips | Stops | Radius of Gyration |
---|---|---|---|---|---|---|---|
Sun | 27.8 km | 8.6 km | 34 min | 11 min | 2.6 | 4.2 | 4.4 km |
Mon | 48.6 km | 11.3 km | 58 min | 14 min | 4.0 | 6.0 | 6.5 km |
Tue | 44.6 km | 12.8 km | 52 min | 14 min | 3.6 | 5.1 | 7.3 km |
Wed | 53.6 km | 14.1 km | 52 min | 13 min | 3.6 | 5.5 | 7.2 km |
Thu | 55.7 km | 12.8 km | 66 min | 14 min | 4.0 | 5.8 | 7.5 km |
Fri | 50.8 km | 10.8 km | 60 min | 13 min | 4.5 | 6.5 | 7.0 km |
Sat | 49.0 km | 11.4 km | 56 min | 13 min | 4.3 | 6.2 | 6.1 km |
Day | Total Distance | Mean Distance/Trip | Total Travel Time | Mean Time/Trip | Trips | Stops | Radius of Gyration |
---|---|---|---|---|---|---|---|
Sun | 39.6 km | 12.7 km | 34 min | 12 min | 2.3 | 2.9 | 9.8 km |
Mon | 81.1 km | 21.0 km | 63 min | 11 min | 3.3 | 3.9 | 12.0 km |
Tue | 56.8 km | 20.9 km | 49 min | 15 min | 2.5 | 2.9 | 13.7 km |
Wed | 87.5 km | 27.6 km | 51 min | 13 min | 2.8 | 3.2 | 13.9 km |
Thu | 82.6 km | 18.5 km | 75 min | 13 min | 3.9 | 4.1 | 10.6 km |
Fri | 71.7 km | 16.5 km | 58 min | 12 min | 3.7 | 4.4 | 11.8 km |
Sat | 61.0 km | 17.4 km | 50 min | 12 min | 3.1 | 3.7 | 10.7 km |
Using only interpolation, we are able to use 604 days across 160 persons. We are interested in what can be gained from also filling in the long gaps. DTWBI looks at the pattern of movement behavior over time, aggregating it into time series with the goal of finding similar patterns. When a long gap occurs within a respondent's day, we need a buffer of sufficient length on either side of the missing data in order to generate informative data against which to compare. The length of this buffer can vary: the shorter the buffer, the less specific we are when selecting imputation candidates. For our purposes, a buffer of at least one hour is necessary.
Because respondents may have multiple gaps within a 24-hour period, and because some gaps may occur at the beginning or end of the period, not all days can be imputed with DTWBI. We require that the total covered time before and after the gap sums to at least 110 minutes. To reduce our workload, if we can already determine that a day will not reach 95% coverage (1368 of 1440 minutes), we do not impute that day. It might be possible in the future to include these days so that they can serve as reference queries for imputation, even though they lack sufficient information to be imputed themselves.
data2[gap == FALSE, new_per_day_coverage := sum(newttn), person_day]
data2[, new_complete := FALSE]
data2[new_per_day_coverage >= minutes(1368), new_complete := TRUE]
data2[new_complete == FALSE, candtw := bufferTest(.SD, hours(1), minutes(1368)), person_day]
Requiring a buffer on both sides for DTWBI provides us with only 187 new days. This is suboptimal, and it occurs because we have prematurely divided our dataset into days. If we instead maintained the data as uninterrupted streams until after imputation, we could often obtain data on both sides of, for example, nighttime missingness. We have two options in this case: (1) apply DTWBI to beginning/end missingness, incorporating match buffers on only one side, or (2) perform DTWBI before the data are split into days, and split after imputation.
# data2[, lastttn := (hms::as_hms("04:00:00") - hms::as_hms(timestamp[.N])), person_day]
# data2[lastttn < 0, lastttn := lastttn + as.duration(hours(24))]
# data2[is.na(newttn), newttn := lastttn, person_day]
A problem we run into with option 2 is that many of the long gaps (which could be handled with DTWBI) are interspersed with single measurements, which keeps us from having the full length of matching buffer; essentially, these isolated points prevent the method from working. The figure below illustrates the issue: the black line shows the gap time, and the color on either side of it shows the one-hour match buffer. The best solution here is likely to combine the intersecting gaps into a single gap, recording any distance observed at the single points flanked by long gaps, and then handling those cases as needed.
data3 <- copy(data2[, .(device_id, latitude, longitude, accuracy, timestamp, timestamp_date, adj_date, day_of_week, person_day)])
data3[, tfp := timestamp - shift(timestamp, n = 1), device_id] # time from previous point
# data3[, tfp := shift(ttn)]
# Start a new block whenever more than 12 hours have elapsed since the previous point
data3[, newid := rleid(device_id, !is.na(tfp) & ifelse(tfp > hours(12), tfp, 0))]
data3[, person_block := paste(device_id, newid, sep = "_")]
data3[, ttn := shift(timestamp, n = -1) - timestamp, person_block] # time to next point
data3[, gap := TRUE]
data3[ttn < minutes(5), gap := FALSE]
test <- data3[person_block == "100_1"]
gaps <- test[gap == TRUE, .(person_block, gapid = .I, gaptime = timestamp, ttn, start = timestamp - hours(1), end = timestamp + ttn + hours(1))]
ggplot(gaps, aes(x = gaptime, y = gapid)) +
geom_linerange(aes(xmin = start, xmax = end, color = factor(gapid))) +
geom_linerange(aes(xmin = gaptime, xmax = gaptime + ttn), color = "black", size = 1) +
scale_x_datetime("Time")+
scale_y_continuous("Gap number")+
scale_color_discrete("Gap number")+
theme_minimal() +
theme(legend.position = "none")
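mergeGap() is defined in the sourced helper files and is not shown here. A minimal, hypothetical sketch of the interval-merging logic it is assumed to implement, in which buffered gaps that overlap receive the same id:
# Hypothetical sketch only; the actual mergeGap() lives in the sourced helpers.
mergeGapSketch <- function(end, start) {
  n <- length(start)          # gaps ordered by start time within a person_block
  id <- integer(n)
  id[1] <- 1L
  running_end <- end[1]
  if (n > 1) {
    for (i in 2:n) {
      # open a new merged gap only if this buffer starts after everything seen so far
      id[i] <- if (start[i] > running_end) id[i - 1] + 1L else id[i - 1]
      running_end <- max(running_end, end[i])
    }
  }
  id
}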
gaps <- data3[gap == TRUE, .(gapid = 1:.N, gaptime = timestamp, start = timestamp - hours(1), end = timestamp + ttn + hours(1)), person_block]
gaps[, newgap := mergeGap(end, start), person_block]
gaps[, `:=`(newstart = min(start), newend = max(end, na.rm = TRUE)), .(person_block, newgap)]
gaps[start == newstart, forebuffer_end := gaptime]
gaps[end == newend, endbuffer_start := gaptime]
gaps[, `:=`(endbuffer_start = na.exclude(endbuffer_start)[1], forebuffer_end = na.exclude(forebuffer_end)[1]), .(person_block, newgap)]
data3[gaps,
on = .(person_block, timestamp >= newstart, timestamp <= newend),
`:=`(newgap = i.newgap, newstart = i.newstart, newend = i.newend, forebuffer_end = i.forebuffer_end, endbuffer_start = i.endbuffer_start)]
data3[, extra_point := FALSE]
data3[!is.na(newgap) & gap == FALSE, extra_point := timestamp %between% c(forebuffer_end[1], endbuffer_start[1]), .(person_block, newgap)]
data3[!is.na(newgap), candtw := testBufferSingleGap(.SD), .(person_block, newgap)]
data3[!is.na(newgap), atleast24h := testBufferAfterMerge(.SD, min.day.minutes = minutes(1368)), .(person_block)]
data3[gap == TRUE | extra_point == TRUE, interdist := sum(geodist_vec(longitude, latitude, sequential = TRUE, measure = "haversine")), .(person_block, newgap)]
data3[!is.na(newgap), interdist := na.exclude(interdist)[1], .(person_block, newgap)]
data3[, newgaplength := (endbuffer_start[1] - forebuffer_end[1]), .(person_block, newgap)]
data3[newgaplength > hours(12), candtw := FALSE]
data3[, aftermergeid := paste(person_block, rleid(newgap), sep = "_"), .(person_block)]
If we merge the long gaps, we end up with 2354 long gaps that can be imputed. DTWBI requires a uniform, discrete time grid, with the travel behavior aggregated to that level. We have chosen 15-minute periods to strike a balance: short enough to represent differential travel behavior, yet long enough to encompass multiple location measurements given the 5-minute minimum gap length.
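One way to tally the merged gaps that remain eligible for imputation (a sketch; the exact bookkeeping behind the 2354 figure may differ):
uniqueN(data3[candtw == TRUE & !is.na(newgap), .(person_block, newgap)])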
setorder(data3, device_id, timestamp)
data3[, hasmissings := any(gap == TRUE), aftermergeid]
full <- copy(data3[hasmissings == FALSE | candtw == TRUE]) # we lose about 1m records and 4000 person_blocks containing not enough information
full[, time_elapsed := timestamp - timestamp[1], aftermergeid]
full[, ftchunk := floor(as.period(time_elapsed)/minutes(15))]
full[, fifteens := sprintf("%04d", ftchunk)]
full[, grouping := paste(aftermergeid, fifteens, sep = "-")]
setorder(full, aftermergeid, ftchunk)
started.at <- proc.time()
res <- tdtr(full, col_names = list(entity_id_col = "aftermergeid", timestamp_col = "timestamp",
latitude_col = "latitude", longitude_col = "longitude"),
group_col = "grouping", max_error = 30)
cat("Finished in", timetaken(started.at), "\n") ## took about 13 minutes
saveRDS(res, "./data/tdtrdata_221810.RDS")
segs <- getSegments(res, group = TRUE)
saveRDS(segs, "./data/segs_221810.RDS")
We add in the missing time series entries, setting the distance to NA.
segs <- readRDS("./data/segs_221810.RDS")
setDT(segs)
databyfifteen <- segs[, .(tsstart = min(seg_start_time),
tsend = max(seg_end_time),
dist = sum(segdist),
tsstartlat = seg_start_lat[1],
tsendlat = seg_end_lat[.N],
tsstartlon = seg_start_lon[1],
tsendlon = seg_end_lon[.N]),
.(entity_id, .id)]
databyfifteen[, c("person_block", "fifteens") := tstrsplit(.id, split = "-")]
databyfifteen[, ftchunk := as.numeric(fifteens)]
databyfifteen[, tsstart := as_datetime(tsstart)]
databyfifteen[, tsend := as_datetime(tsend)]
databyfifteen[, first_timestamp := (min(tsstart)), entity_id]
missingfifteens <- databyfifteen[, seq(from = 0, to = max(ftchunk))[!seq(from = 0, to = max(ftchunk)) %in% ftchunk], entity_id]
missingfifteens[, `:=`(lastbefore = min(V1) - 1,
firstafter = max(V1) + 1), entity_id]
newrows <- databyfifteen[missingfifteens, on = .(entity_id, ftchunk == lastbefore),
.(entity_id,
.id = paste(entity_id, sprintf("%04d", i.V1), sep = "-"),
tsstart = first_timestamp + minutes(i.V1 * 15) ,
tsend = first_timestamp + minutes((i.V1 + 1) * 15),
dist = NA,
tsstartlat = x.tsendlat,
tsendlat = NA_real_,
tsstartlon = x.tsendlon,
tsendlon = NA_real_,
person_block,
fifteens = sprintf("%04d", i.V1),
ftchunk = i.V1,
first_timestamp,
firstafter)]
newrows[databyfifteen, on = .(entity_id, firstafter == ftchunk), `:=`(tsendlat = i.tsstartlat, tsendlon = i.tsstartlon)]
set(newrows, j = "firstafter", value = NULL)
databyfifteen <- rbindlist(list(
databyfifteen, newrows
))
setorder(databyfifteen, entity_id, ftchunk)
databyfifteen[, first_timestamp := NULL]
databyfifteen[, time := hms::as_hms(tsstart)]
databyfifteen[, maxftchunk := max(ftchunk), entity_id]
This gives us 4928 complete or imputable slices of varying length. The next step is imputation using DTWBI with the Burst parameters. First we separate out the 'queries', the series containing long-gap missing data, and order them by the amount of missingness.
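Under the assumption that each slice corresponds to one entity_id series in databyfifteen, the count quoted above can be checked with:
databyfifteen[, uniqueN(entity_id)]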
n_imps <- 10
prob.power <- 4
window.size <- hours(3)
match.window <- 4
impdata <-
copy(rbindlist(replicate(n_imps, databyfifteen, simplify = FALSE), idcol = "imp"))
impdata[, orig_dist := dist]
queries <- impdata[imp == 1, .SD[any(is.na(dist))], entity_id]
query_order <- queries[, sum(is.na(dist)), entity_id][order(V1, decreasing = FALSE), entity_id]
library(doParallel)
detectCores()
## [1] 16
registerDoParallel(10)
allimps <- foreach(
this_imp = seq_len(n_imps),
.combine = rbind,
.inorder = FALSE,
.multicombine = TRUE
) %dopar% {
for (rechain_nr in seq_len(2)) {
cat("Chain number", rechain_nr, "\n", file = "")
# for (i in seq_along(query_order)) { # full run over all queries
for (i in 2) { # only the second query is imputed in this illustration
refs <-
impdata[imp == this_imp &
entity_id != query_order[i], # Select only those for this imputation chain
.SD[!any(is.na(dist))], # That are complete (includes previously imputed)
entity_id] # Grouped by entity_id so we get only those with no NAs when we use any()
this_query <-
queries[entity_id == query_order[i]] # Select the query for innermost loop
miss <-
this_query[, is.na(dist)] # Extract the vector of missingness
obs <-
!miss # The inverse observed vector
# query <- this_query
impdata <- imputeMidmissing(
impdata = impdata,
this_imp = this_imp,
query = this_query,
obs = obs,
refs = refs,
n_imps = 1,
prob.power = prob.power,
window = window.size,
match.window = match.window
)
} # End query iteration
} # End chaining iteration
impdata[imp == this_imp]
} # End %dopar% over n_imps
allimps[, `:=`(
window.size = window.size,
prob.power = prob.power,
match.window = match.window
)]
allimps[is.na(orig_dist) & !is.na(dist)]
imp | entity_id | .id | tsstart | tsend | dist |
---|---|---|---|---|---|
1 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 0.000000 |
2 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 3.696013 |
3 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 154.512946 |
4 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 8.658444 |
5 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 0.939696 |
6 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 128.711587 |
7 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 2.530084 |
8 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 9.753136 |
9 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 360.101091 |
10 | 199_980_10 | 199_980_10-0004 | 2018-11-06 09:50:42.438 | 2018-11-06 10:05:42.438 | 703.494775 |