splits
splits.Rmd
Get segment labels
Splits, or segments, measure at what time a game event happens, and how long a player took to get there from their previous segment.
Unique split id?
Although segments are chosen by a program, so far we’ve only found the player labels for each split or segment. Players can define their own segments and labels, too, so we don’t want to confuse those points with others.
On the other hand we don’t want to keep bomb
and throw
out all the bombs
with the bathwater.
labels_dat <-
# pull columns for this analysis from splits data
sio_splits_df %>%
select(player_name, game_event, name, segment_number, run_id, player_id) %>%
mutate(
# create a lower-case label
label = tolower(game_event)
)
labels_dat %>% head()
#> player_name game_event name segment_number run_id
#> 0 anatomecha Morphing Ball Morphing Ball 0 ato1
#> 1 anatomecha First Missiles First Missiles 1 ato1
#> 2 anatomecha Bomb Bomb 2 ato1
#> 3 anatomecha First Super Missiles First Super Missiles 3 ato1
#> 4 anatomecha Charge Beam Charge Beam 4 ato1
#> 5 anatomecha Spazer Spazer 5 ato1
#> player_id label
#> 0 89646 morphing ball
#> 1 89646 first missiles
#> 2 89646 bomb
#> 3 89646 first super missiles
#> 4 89646 charge beam
#> 5 89646 spazer
Most common labels
label_counts <-
labels_dat %>%
mutate(
starts_with = str_sub(label, 1, 1)
) %>%
count(starts_with, label) %>%
arrange(starts_with, desc(n))
label_counts %>%
ungroup() %>%
select(-starts_with) %>%
arrange(desc(n)) %>%
filter(n > 50)
#> label n
#> 1 space jump 411
#> 2 screw attack 360
#> 3 bombs 339
#> 4 gravity suit 326
#> 5 phantoon 294
#> 6 plasma beam 271
#> 7 speed booster 262
#> 8 varia suit 259
#> 9 spring ball 257
#> 10 wave beam 252
#> 11 ice beam 248
#> 12 charge beam 231
#> 13 grapple beam 207
#> 14 spazer 203
#> 15 x-ray 203
#> 16 golden 4 154
#> 17 varia 154
#> 18 morph ball 153
#> 19 ridley e-tank 143
#> 20 ridley 142
#> 21 g4 119
#> 22 ridley's e-tank 115
#> 23 x-ray scope 115
#> 24 botwoon 113
#> 25 turtle e-tank 113
#> 26 grapple 112
#> 27 golden four 105
#> 28 gravity 105
#> 29 escape! 96
#> 30 kraid 94
#> 31 mother brain 93
#> 32 draygon 91
#> 33 escape 91
#> 34 kraid missiles 84
#> 35 hi-jump boots 82
#> 36 crocomire 72
#> 37 grappling beam 72
#> 38 bomb torizo 71
#> 39 ss supers 71
#> 40 any% pbs 70
#> 41 plasma 70
#> 42 crab supers 66
#> 43 ice 63
#> 44 boots 62
#> 45 early supers 59
#> 46 watering hole supers 59
#> 47 golden torizo 57
#> 48 green hill missiles 57
#> 49 kraid e-tank 56
#> 50 etecoon supers 53
# write labels to sheet for labelling
# https://docs.google.com/spreadsheets/d/1FRHsS7WQvVhI5_yP-twHCJW2YRJRbeKg4P42EY3Jzrk/edit#gid=1079532092
write_csv(label_counts, "data-raw/sio-label-counts.csv")
relabels <- googlesheets4::read_sheet("https://docs.google.com/spreadsheets/d/1FRHsS7WQvVhI5_yP-twHCJW2YRJRbeKg4P42EY3Jzrk/edit#gid=1079532092")
write_csv(relabels, "data-raw/sio-relabels.csv")
Add labels
relabels <- read_csv("../data-raw/sio-relabels.csv") %>%
select(label, supermetroid_label) %>%
distinct()
#> Rows: 2563 Columns: 4
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (3): starts_with, label, supermetroid_label
#> dbl (1): n
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sio_relabelled <-
sio_splits_df %>%
mutate(
label = tolower(game_event)
) %>%
left_join(relabels, by = "label") %>%
# replace label values
mutate(label = if_else(!is.na(supermetroid_label), supermetroid_label, label)) %>%
ungroup() %>%
select(-game_event, -name) %>%
select(contains("label"), player_name, contains("realtime"), everything())
# there now, that's looking better
head(sio_relabelled)
#> label supermetroid_label player_name realtime_start_ms
#> 1 morph ball morph ball anatomecha 0
#> 2 first missiles <NA> anatomecha 200835
#> 3 bombs bombs anatomecha 224926
#> 4 first super missiles <NA> anatomecha 358321
#> 5 charge beam charge beam anatomecha 589662
#> 6 spazer spazer anatomecha 709962
#> realtime_duration_ms realtime_end_ms realtime_shortest_duration_ms
#> 1 200835 200835 200835
#> 2 24091 224926 23286
#> 3 133395 358321 129371
#> 4 231341 589662 227335
#> 5 120300 709962 120299
#> 6 97876 807838 95884
#> realtime_gold segment_number segment_id run_id
#> 1 TRUE 0 69d90da3-1634-4af4-9096-419a32bedd0a ato1
#> 2 FALSE 1 fad50319-d55b-460e-923a-c416f233daea ato1
#> 3 FALSE 2 089daf0b-64c3-4eec-8902-022335b2b179 ato1
#> 4 FALSE 3 2a215dcd-2989-4939-8558-a7215904dfd1 ato1
#> 5 FALSE 4 e906991d-e919-47c1-a809-2b4da9b67461 ato1
#> 6 FALSE 5 10439232-43c7-4752-afb9-ee23782da4a9 ato1
#> player_id
#> 1 89646
#> 2 89646
#> 3 89646
#> 4 89646
#> 5 89646
#> 6 89646
# check out counts now
sio_relabelled %>%
count(label, supermetroid_label) %>%
arrange(desc(n)) %>%
filter(n > 50) %>%
gt()
label | supermetroid_label | n |
---|---|---|
space jump | space jump | 450 |
varia suit | varia suit | 425 |
grapple beam | grapple beam | 405 |
screw attack | screw attack | 404 |
bombs | bombs | 399 |
x-ray | x-ray | 359 |
plasma beam | plasma beam | 342 |
gravity suit | gravity suit | 334 |
ice beam | ice beam | 324 |
phantoon | phantoon | 323 |
speed booster | speed booster | 313 |
spring ball | spring ball | 289 |
golden 4 | golden 4 | 286 |
wave beam | wave beam | 283 |
charge beam | charge beam | 232 |
spazer | spazer | 218 |
hi-jump boots | hi-jump boots | 212 |
morph ball | morph ball | 208 |
ridley | ridley | 182 |
ridley e-tank | NA | 143 |
spore spawn supers | spore spawn supers | 136 |
draygon | draygon | 121 |
ridley's e-tank | NA | 115 |
kraid | kraid | 114 |
botwoon | NA | 113 |
turtle e-tank | NA | 113 |
golden four | NA | 105 |
gravity | NA | 105 |
crocomire | crocomire | 103 |
mother brain | mother brain | 99 |
escape! | NA | 96 |
first supers | first supers | 92 |
escape | NA | 91 |
kraid missiles | NA | 84 |
bomb torizo | NA | 71 |
any% pbs | NA | 70 |
crab supers | NA | 66 |
watering hole supers | NA | 59 |
golden torizo | NA | 57 |
green hill missiles | NA | 57 |
kraid e-tank | NA | 56 |
etecoon supers | NA | 53 |
sio_df <- sio_relabelled %>% select(-supermetroid_label) %>%
rename(game_event = label)
sio_df %>% head()
#> game_event player_name realtime_start_ms realtime_duration_ms
#> 1 morph ball anatomecha 0 200835
#> 2 first missiles anatomecha 200835 24091
#> 3 bombs anatomecha 224926 133395
#> 4 first super missiles anatomecha 358321 231341
#> 5 charge beam anatomecha 589662 120300
#> 6 spazer anatomecha 709962 97876
#> realtime_end_ms realtime_shortest_duration_ms realtime_gold segment_number
#> 1 200835 200835 TRUE 0
#> 2 224926 23286 FALSE 1
#> 3 358321 129371 FALSE 2
#> 4 589662 227335 FALSE 3
#> 5 709962 120299 FALSE 4
#> 6 807838 95884 FALSE 5
#> segment_id run_id player_id
#> 1 69d90da3-1634-4af4-9096-419a32bedd0a ato1 89646
#> 2 fad50319-d55b-460e-923a-c416f233daea ato1 89646
#> 3 089daf0b-64c3-4eec-8902-022335b2b179 ato1 89646
#> 4 2a215dcd-2989-4939-8558-a7215904dfd1 ato1 89646
#> 5 e906991d-e919-47c1-a809-2b4da9b67461 ato1 89646
#> 6 10439232-43c7-4752-afb9-ee23782da4a9 ato1 89646
# not run
usethis::use_data(sio_df, overwrite=TRUE)
Selected splits
Select most common splits used by players.
sio_df %>%
group_by(game_event) %>%
summarise(
n_runners = n_distinct(player_id)
) %>%
filter(n_runners < 20) %>%
ggplot(aes(x=n_runners)) +
geom_histogram(fill=sm_col_h$label) +
theme_sm() +
labs(title = "Counts of runners for splits")
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
common_splits <-
# get standard order - recorded by at least 30 runners
sio_df %>%
group_by(game_event) %>%
summarise(
n_runners = n_distinct(player_id)
) %>%
filter(n_runners >= 30) %>%
arrange(desc(n_runners))
common_splits
#> # A tibble: 19 × 2
#> game_event n_runners
#> <chr> <int>
#> 1 varia suit 80
#> 2 bombs 75
#> 3 space jump 75
#> 4 grapple beam 72
#> 5 screw attack 68
#> 6 x-ray 68
#> 7 ice beam 63
#> 8 plasma beam 58
#> 9 speed booster 58
#> 10 phantoon 56
#> 11 gravity suit 55
#> 12 golden 4 54
#> 13 wave beam 49
#> 14 spring ball 45
#> 15 charge beam 37
#> 16 morph ball 37
#> 17 hi-jump boots 35
#> 18 spazer 33
#> 19 ridley 30
# not run
usethis::use_data(common_splits, overwrite=TRUE)
What routes do people take?
Oh this code is all borked and from early experimental phase from here. Waiting to copy-paste, etc.
routes_raw <-
sio_df %>%
select(supermetroid_label, player_name, split_order = segment_number, realtime_end_ms, run_id, player_id) %>%
# get just the universal splits
filter(!is.na(supermetroid_label))
#> Error in `select()`:
#> ! Can't subset columns that don't exist.
#> ✖ Column `supermetroid_label` doesn't exist.
splits_orders <-
routes_raw %>%
select(supermetroid_label, split_order, run_id)
#> Error in eval(expr, envir, enclos): object 'routes_raw' not found
nested_splits <-
splits_orders %>%
arrange(run_id, split_order) %>%
select(-split_order) %>%
group_by(run_id) %>%
nest(splits = supermetroid_label) %>%
ungroup() %>%
group_by(splits) %>%
nest(runs = run_id) %>%
ungroup() %>%
mutate(
route_length = map_int(splits, nrow),
route_id = str_c("route ", 1:n())
) %>%
arrange(desc(route_length))
#> Error in eval(expr, envir, enclos): object 'splits_orders' not found
nested_splits %>%
head() %>%
gt()
#> Error in eval(expr, envir, enclos): object 'nested_splits' not found
routes <-
nested_splits %>%
# head() %>%
mutate(event = map(splits, \(df) left_join(df, common_splits, by = "supermetroid_label")),
route_anatomecha = map(event, \(df) df %>% filter(!is.na(split_anatomecha)) %>% pull(split_anatomecha)),
route_standard = map_lgl(route_anatomecha,
\(x) all(sort(x) == x))
)
#> Error in eval(expr, envir, enclos): object 'nested_splits' not found
time vs cumulative time by splits
# dashboard this for y axis
# dashboard this for x axis, too selector tool with tick boxes, select all, clear
# plot <-
sio_df %>%
left_join(common_splits, by = "supermetroid_label") %>%
filter(!is.na(supermetroid_label)) %>%
mutate(split_anatomecha = as_factor(split_anatomecha),
t_h = realtime_end_ms/1000/60
) %>%
ggplot(aes(x = split_anatomecha, y = t_h, group=run_id)) +
geom_line(colour=sm_cols$orange, alpha = 0.2) +
geom_point(colour=sm_cols$orange, alpha = 0.2) +
theme_sm(base_size = 30) +
labs(
title = "Players take different routes",
subtitle = "Real time completion of each game event would be
always increasing if all players took the same route as anatomecha" %>% str_wrap(60),
x = "Game event (loot, boss, etc.)" %>% str_wrap(12),
y = "Real time completion (minutes)" %>% str_wrap(20),
caption = "splits.io"
) +
theme(
axis.text.x = element_text(angle=90)
) +
scale_x_discrete(labels=c("start", game_event)) +
facet_grid(route_standard ~. )
#> Error in `left_join()`:
#> ! Join columns in `x` must be present in the data.
#> ✖ Problem with `supermetroid_label`.
# ggplotly(plot)
# selected_split <- "ice beam"
selected_split <- "ice beam"# sample(game_event, 1)
rank <- c(1,25)
#xlim <- c(0.8, 1.2)
# +
# xlim(xlim[1], xlim[2])
Towards the best route
identifying clusters
Try knn
Wide-form array: - columns as events - rows as routes - values as mean event time
I think this should work on individual observations.
- Get an array of segments
- Set an n-fold cross validation
- Evaluated the accuracy of the KNn classifer with different values of k by cross validation
# get an array of segments
identifying clusters
Try taking the mean value of each event of each route, then imputing the missing values from local means. How to impute via local mean? That is, need to impute, conditional on the other values in the array. Need to read about imputation. Perhaps I should just write my own imputation, I think I could calculate that. Then apply hierarchical clustering to find out which.
# create wideform array
routes_wide <-
routes_raw %>%
rename(game_event = supermetroid_label) %>%
group_by(run_id) %>%
filter(realtime_end_ms == max(realtime_end_ms),
game_event %in% game_event
) %>%
ungroup() %>%
distinct() %>%
# dplyr::group_by(run_id, game_event) %>%
# dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
# dplyr::filter(n > 1L)
pivot_wider(id_cols = "run_id",
names_from = "game_event",
values_from = "realtime_end_ms")
#> Error in eval(expr, envir, enclos): object 'routes_raw' not found
library(caret)
#> Loading required package: lattice
#>
#> Attaching package: 'caret'
#> The following object is masked from 'package:purrr':
#>
#> lift
trControl <- trainControl(method = "cv",
number = 5)
fit <- train(supermetroid_label ~ realtime_end_ms,
method = "knn",
tuneGrid = expand.grid(k = 1:10),
trControl = trControl,
metric = "Accuracy",
data = routes_raw)
#> Error in eval(expr, p): object 'routes_raw' not found
routes_wide <-
routes %>%
select(route_id, runs) %>%
unnest(runs) %>%
left_join(sio_df %>%
filter(split %in% game_event) %>%
select(run_id,
realtime_end_ms, split) %>%
group_by(run_id, split) %>%
filter(realtime_end_ms==max(realtime_end_ms)),
by = "run_id") %>%
group_by(route_id, run_id) %>%
# 0 entries must be corrupt?
filter(realtime_end_ms != 0) %>%
pivot_wider(names_from = "split", values_from = "realtime_end_ms") %>%
janitor::clean_names()
#> Error in eval(expr, envir, enclos): object 'routes' not found
sio_routes_wide <- routes_wide
usethis::use_data(sio_routes_wide)
library(caret)
preProcValues <-
preProcess(routes_wide %>% ungroup() %>% select(-c(run_id, route_id)))
#> Error in eval(expr, envir, enclos): object 'routes_wide' not found
trControl <- trainControl(method = "cv",
number = 5)
# routes_wide %>% names() %>% paste(collapse ="+") %>% cat()
fit <-
train(
route_id ~ morph_ball + bombs +
charge_beam + varia_suit + speed_booster +
wave_beam + grapple_beam + ice_beam + gravity_suit +
space_jump + spring_ball +
plasma_beam + screw_attack + spazer +
hi_jump_boots + phantoon + draygon +
ridley + kraid + x_ray + mother_brain + escape,
method = "knn",
tuneGrid = expand.grid(k = 1:10),
trControl = trControl,
metric = "Accuracy",
data = routes_wide
)
#> Error in eval(expr, p): object 'routes_wide' not found
predict(preProcValues)
#> Error in eval(expr, envir, enclos): object 'preProcValues' not found
ice_beam <-
routes_wide %>%
select(route_id, route_id, ice_beam) %>%
group_by(route_id) %>%
summarise(
ice_beam = mean(ice_beam)
) %>%
ungroup() %>%
filter(!is.na(ice_beam))
#> Error in eval(expr, envir, enclos): object 'routes_wide' not found
clusters <- dist(ice_beam) %>% hclust()
#> Error in eval(expr, envir, enclos): object 'ice_beam' not found
plot(clusters, labels = ice_beam$route_id)
#> Error in eval(expr, envir, enclos): object 'clusters' not found
library(ggdendro)
ggdendrogram(clusters)
#> Error in eval(expr, envir, enclos): object 'clusters' not found
# fuckit write csv
write_csv(routes_wide, "dev/routes_wide.csv")
import numpy as np
from sklearn.impute import KNNImputer
routes_wide = pd.read_csv("dev/routes_wide.csv")
#> Error: NameError: name 'pd' is not defined
imputer = KNNImputer()
df_knn_imputed = pd.DataFrame(imputer.fit_transform(routes_wide.drop(columns = ['run_id', 'route_id'])))
#> Error: NameError: name 'pd' is not defined
# set up df of imputed data in R
x_event = py$df_knn_imputed
#> Error in eval(expr, envir, enclos): object 'py' not found
colnames(x_event) <- routes_wide %>% ungroup() %>% select(-c(route_id, run_id)) %>% colnames()
#> Error in eval(expr, envir, enclos): object 'routes_wide' not found
routes_imputed =
routes_wide %>% select(route_id, run_id) %>% bind_cols(x_event)
#> Error in eval(expr, envir, enclos): object 'routes_wide' not found
# try using caret to do knn classification
routes_imputed %>% View()
#> Error in eval(expr, envir, enclos): object 'routes_imputed' not found
routes_scaled <-
routes_imputed %>%
ungroup() %>%
select(-c(route_id, run_id)) %>%
mutate(across(everything(), scale)) %>%
bind_cols(routes_imputed %>% select(route_id, run_id)) %>%
select(route_id, run_id, everything())
#> Error in eval(expr, envir, enclos): object 'routes_imputed' not found
# https://rpubs.com/pmtam/knn
# Run algorithms using 10-fold cross validation
trainControl <- trainControl(method="repeatedcv", number=10, repeats=3)
metric <- "Accuracy"
grid <- expand.grid(.k=seq(1,20,by=1))
fit.knn <- train(route_id # ~ ice_beam,
~ # run_id
+ morph_ball + bombs +
charge_beam + varia_suit + speed_booster +
wave_beam + grapple_beam + ice_beam + gravity_suit +
space_jump + spring_ball +
plasma_beam + screw_attack + spazer +
hi_jump_boots + phantoon + draygon +
ridley + kraid + x_ray + mother_brain + escape,
data=routes_scaled %>% select(-run_id), method="knn",
metric=metric ,trControl=trainControl, tuneGrid=grid)
#> Error in eval(expr, p): object 'routes_scaled' not found
knn.k1 <- fit.knn$bestTune # keep this Initial k for testing with knn() function in next section
#> Error in eval(expr, envir, enclos): object 'fit.knn' not found
print(fit.knn)
#> Error in eval(expr, envir, enclos): object 'fit.knn' not found