splits • supermetroid

library(supermetroid)
library(tidyverse)
library(gt)

This document updates splitsio df for the analysis

Get segment labels

Splits, or segments, measure at what time a game event happens, and how long a player took to get there from their previous segment.

Unique split id?

Although segments are chosen by a program, so far we’ve only found the player labels for each split or segment. Players can define their own segments and labels, too, so we don’t want to confuse those points with others.

On the other hand we don’t want to keep bomb and throw out all the bombs with the bathwater.

labels_dat <- 
  # pull columns for this analysis from splits data
  sio_splits_df %>% 
  select(player_name, game_event, name, segment_number, run_id, player_id) %>% 
  mutate(
    # create a lower-case label 
    label = tolower(game_event)
    )
labels_dat %>% head()
#>   player_name           game_event                 name segment_number run_id
#> 0  anatomecha        Morphing Ball        Morphing Ball              0   ato1
#> 1  anatomecha       First Missiles       First Missiles              1   ato1
#> 2  anatomecha                 Bomb                 Bomb              2   ato1
#> 3  anatomecha First Super Missiles First Super Missiles              3   ato1
#> 4  anatomecha          Charge Beam          Charge Beam              4   ato1
#> 5  anatomecha               Spazer               Spazer              5   ato1
#>   player_id                label
#> 0     89646        morphing ball
#> 1     89646       first missiles
#> 2     89646                 bomb
#> 3     89646 first super missiles
#> 4     89646          charge beam
#> 5     89646               spazer

Most common labels

label_counts <- 
labels_dat %>% 
  mutate(
    starts_with = str_sub(label, 1, 1)
  ) %>% 
  count(starts_with, label) %>% 
  arrange(starts_with, desc(n))

label_counts %>% 
  ungroup() %>% 
  select(-starts_with) %>%  
  arrange(desc(n)) %>%
  filter(n > 50)
#>                   label   n
#> 1            space jump 411
#> 2          screw attack 360
#> 3                 bombs 339
#> 4          gravity suit 326
#> 5              phantoon 294
#> 6           plasma beam 271
#> 7         speed booster 262
#> 8            varia suit 259
#> 9           spring ball 257
#> 10            wave beam 252
#> 11             ice beam 248
#> 12          charge beam 231
#> 13         grapple beam 207
#> 14               spazer 203
#> 15                x-ray 203
#> 16             golden 4 154
#> 17                varia 154
#> 18           morph ball 153
#> 19        ridley e-tank 143
#> 20               ridley 142
#> 21                   g4 119
#> 22      ridley's e-tank 115
#> 23          x-ray scope 115
#> 24              botwoon 113
#> 25        turtle e-tank 113
#> 26              grapple 112
#> 27          golden four 105
#> 28              gravity 105
#> 29              escape!  96
#> 30                kraid  94
#> 31         mother brain  93
#> 32              draygon  91
#> 33               escape  91
#> 34       kraid missiles  84
#> 35        hi-jump boots  82
#> 36            crocomire  72
#> 37       grappling beam  72
#> 38          bomb torizo  71
#> 39            ss supers  71
#> 40             any% pbs  70
#> 41               plasma  70
#> 42          crab supers  66
#> 43                  ice  63
#> 44                boots  62
#> 45         early supers  59
#> 46 watering hole supers  59
#> 47        golden torizo  57
#> 48  green hill missiles  57
#> 49         kraid e-tank  56
#> 50       etecoon supers  53


# write labels to sheet for labelling 
# https://docs.google.com/spreadsheets/d/1FRHsS7WQvVhI5_yP-twHCJW2YRJRbeKg4P42EY3Jzrk/edit#gid=1079532092


write_csv(label_counts, "data-raw/sio-label-counts.csv")

relabels <- googlesheets4::read_sheet("https://docs.google.com/spreadsheets/d/1FRHsS7WQvVhI5_yP-twHCJW2YRJRbeKg4P42EY3Jzrk/edit#gid=1079532092")

write_csv(relabels, "data-raw/sio-relabels.csv")

Add labels

relabels <- read_csv("../data-raw/sio-relabels.csv") %>% 
  select(label, supermetroid_label) %>% 
  distinct()
#> Rows: 2563 Columns: 4
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (3): starts_with, label, supermetroid_label
#> dbl (1): n
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


sio_relabelled <- 
  sio_splits_df %>% 
    mutate(
      label = tolower(game_event)
    ) %>% 
    left_join(relabels, by = "label") %>% 
    # replace label values
    mutate(label = if_else(!is.na(supermetroid_label), supermetroid_label, label)) %>% 
    ungroup() %>% 
    select(-game_event, -name) %>% 
    select(contains("label"), player_name, contains("realtime"), everything())

# there now, that's looking better
head(sio_relabelled)
#>                  label supermetroid_label player_name realtime_start_ms
#> 1           morph ball         morph ball  anatomecha                 0
#> 2       first missiles               <NA>  anatomecha            200835
#> 3                bombs              bombs  anatomecha            224926
#> 4 first super missiles               <NA>  anatomecha            358321
#> 5          charge beam        charge beam  anatomecha            589662
#> 6               spazer             spazer  anatomecha            709962
#>   realtime_duration_ms realtime_end_ms realtime_shortest_duration_ms
#> 1               200835          200835                        200835
#> 2                24091          224926                         23286
#> 3               133395          358321                        129371
#> 4               231341          589662                        227335
#> 5               120300          709962                        120299
#> 6                97876          807838                         95884
#>   realtime_gold segment_number                           segment_id run_id
#> 1          TRUE              0 69d90da3-1634-4af4-9096-419a32bedd0a   ato1
#> 2         FALSE              1 fad50319-d55b-460e-923a-c416f233daea   ato1
#> 3         FALSE              2 089daf0b-64c3-4eec-8902-022335b2b179   ato1
#> 4         FALSE              3 2a215dcd-2989-4939-8558-a7215904dfd1   ato1
#> 5         FALSE              4 e906991d-e919-47c1-a809-2b4da9b67461   ato1
#> 6         FALSE              5 10439232-43c7-4752-afb9-ee23782da4a9   ato1
#>   player_id
#> 1     89646
#> 2     89646
#> 3     89646
#> 4     89646
#> 5     89646
#> 6     89646

# check out counts now
sio_relabelled %>% 
  count(label, supermetroid_label) %>% 
  arrange(desc(n)) %>% 
  filter(n > 50) %>% 
  gt()

label	supermetroid_label	n
space jump	space jump	450
varia suit	varia suit	425
grapple beam	grapple beam	405
screw attack	screw attack	404
bombs	bombs	399
x-ray	x-ray	359
plasma beam	plasma beam	342
gravity suit	gravity suit	334
ice beam	ice beam	324
phantoon	phantoon	323
speed booster	speed booster	313
spring ball	spring ball	289
golden 4	golden 4	286
wave beam	wave beam	283
charge beam	charge beam	232
spazer	spazer	218
hi-jump boots	hi-jump boots	212
morph ball	morph ball	208
ridley	ridley	182
ridley e-tank	NA	143
spore spawn supers	spore spawn supers	136
draygon	draygon	121
ridley's e-tank	NA	115
kraid	kraid	114
botwoon	NA	113
turtle e-tank	NA	113
golden four	NA	105
gravity	NA	105
crocomire	crocomire	103
mother brain	mother brain	99
escape!	NA	96
first supers	first supers	92
escape	NA	91
kraid missiles	NA	84
bomb torizo	NA	71
any% pbs	NA	70
crab supers	NA	66
watering hole supers	NA	59
golden torizo	NA	57
green hill missiles	NA	57
kraid e-tank	NA	56
etecoon supers	NA	53

sio_df <- sio_relabelled %>% select(-supermetroid_label) %>% 
  rename(game_event = label)

sio_df %>% head()
#>             game_event player_name realtime_start_ms realtime_duration_ms
#> 1           morph ball  anatomecha                 0               200835
#> 2       first missiles  anatomecha            200835                24091
#> 3                bombs  anatomecha            224926               133395
#> 4 first super missiles  anatomecha            358321               231341
#> 5          charge beam  anatomecha            589662               120300
#> 6               spazer  anatomecha            709962                97876
#>   realtime_end_ms realtime_shortest_duration_ms realtime_gold segment_number
#> 1          200835                        200835          TRUE              0
#> 2          224926                         23286         FALSE              1
#> 3          358321                        129371         FALSE              2
#> 4          589662                        227335         FALSE              3
#> 5          709962                        120299         FALSE              4
#> 6          807838                         95884         FALSE              5
#>                             segment_id run_id player_id
#> 1 69d90da3-1634-4af4-9096-419a32bedd0a   ato1     89646
#> 2 fad50319-d55b-460e-923a-c416f233daea   ato1     89646
#> 3 089daf0b-64c3-4eec-8902-022335b2b179   ato1     89646
#> 4 2a215dcd-2989-4939-8558-a7215904dfd1   ato1     89646
#> 5 e906991d-e919-47c1-a809-2b4da9b67461   ato1     89646
#> 6 10439232-43c7-4752-afb9-ee23782da4a9   ato1     89646

# not run
usethis::use_data(sio_df, overwrite=TRUE)

Selected splits

Select most common splits used by players.

sio_df %>%
  group_by(game_event) %>% 
  summarise(
    n_runners = n_distinct(player_id)
  ) %>% 
  filter(n_runners < 20) %>%  
  ggplot(aes(x=n_runners)) +
  geom_histogram(fill=sm_col_h$label) +
  theme_sm() +
  labs(title = "Counts of runners for splits")
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

common_splits <-
  # get standard order - recorded by at least 30 runners
  sio_df %>%
  group_by(game_event) %>% 
  summarise(
    n_runners = n_distinct(player_id)
  ) %>% 
  filter(n_runners >= 30) %>% 
  arrange(desc(n_runners)) 

common_splits
#> # A tibble: 19 × 2
#>    game_event    n_runners
#>    <chr>             <int>
#>  1 varia suit           80
#>  2 bombs                75
#>  3 space jump           75
#>  4 grapple beam         72
#>  5 screw attack         68
#>  6 x-ray                68
#>  7 ice beam             63
#>  8 plasma beam          58
#>  9 speed booster        58
#> 10 phantoon             56
#> 11 gravity suit         55
#> 12 golden 4             54
#> 13 wave beam            49
#> 14 spring ball          45
#> 15 charge beam          37
#> 16 morph ball           37
#> 17 hi-jump boots        35
#> 18 spazer               33
#> 19 ridley               30

# not run
usethis::use_data(common_splits, overwrite=TRUE)

What routes do people take?

Oh this code is all borked and from early experimental phase from here. Waiting to copy-paste, etc.

routes_raw <- 
sio_df %>% 
  select(supermetroid_label, player_name, split_order = segment_number, realtime_end_ms, run_id, player_id) %>% 
  # get just the universal splits
  filter(!is.na(supermetroid_label))
#> Error in `select()`:
#> ! Can't subset columns that don't exist.
#> ✖ Column `supermetroid_label` doesn't exist.

splits_orders <- 
routes_raw %>% 
  select(supermetroid_label, split_order, run_id)
#> Error in eval(expr, envir, enclos): object 'routes_raw' not found

nested_splits <- 
splits_orders %>% 
  arrange(run_id, split_order) %>% 
  select(-split_order) %>% 
  group_by(run_id) %>% 
  nest(splits = supermetroid_label) %>% 
  ungroup() %>% 
  group_by(splits) %>% 
  nest(runs = run_id) %>% 
  ungroup() %>% 
  mutate(
    route_length = map_int(splits, nrow),
    route_id = str_c("route ", 1:n())
  ) %>% 
  arrange(desc(route_length))
#> Error in eval(expr, envir, enclos): object 'splits_orders' not found
  
nested_splits %>%
  head() %>% 
  gt()
#> Error in eval(expr, envir, enclos): object 'nested_splits' not found

routes <- 
nested_splits %>%
  # head() %>% 
  mutate(event = map(splits, \(df) left_join(df, common_splits, by = "supermetroid_label")),
         route_anatomecha = map(event, \(df) df %>% filter(!is.na(split_anatomecha)) %>%  pull(split_anatomecha)),
         route_standard = map_lgl(route_anatomecha,
                                  \(x) all(sort(x) == x))
         )
#> Error in eval(expr, envir, enclos): object 'nested_splits' not found

time vs cumulative time by splits

# dashboard this for y axis
# dashboard this for x axis, too selector tool with tick boxes, select all, clear

# plot <- 
  sio_df %>% 
  left_join(common_splits, by = "supermetroid_label") %>% 
  filter(!is.na(supermetroid_label)) %>% 
  mutate(split_anatomecha = as_factor(split_anatomecha),
         t_h = realtime_end_ms/1000/60
         ) %>% 
  ggplot(aes(x = split_anatomecha, y = t_h, group=run_id)) +
  geom_line(colour=sm_cols$orange, alpha = 0.2) +
  geom_point(colour=sm_cols$orange, alpha = 0.2) +
  theme_sm(base_size = 30) +
  labs(
    title = "Players take different routes",
    subtitle = "Real time completion of each game event would be 
      always increasing if all players took the same route as anatomecha" %>% str_wrap(60),
    x = "Game event (loot, boss, etc.)" %>% str_wrap(12),
    y = "Real time completion (minutes)" %>% str_wrap(20),
    caption = "splits.io"
  ) +
  theme(
    axis.text.x = element_text(angle=90)
  ) +
  scale_x_discrete(labels=c("start", game_event)) +
  facet_grid(route_standard ~. ) 
#> Error in `left_join()`:
#> ! Join columns in `x` must be present in the data.
#> ✖ Problem with `supermetroid_label`.

# ggplotly(plot)

# selected_split <- "ice beam" 
selected_split <- "ice beam"# sample(game_event, 1)
rank <- c(1,25)

#xlim <- c(0.8, 1.2)

# +
#   xlim(xlim[1], xlim[2])

Towards the best route

identifying clusters

Try knn

Wide-form array: - columns as events - rows as routes - values as mean event time

I think this should work on individual observations.

Get an array of segments
Set an n-fold cross validation
Evaluated the accuracy of the KNn classifer with different values of k by cross validation

# get an array of segments

identifying clusters

Try taking the mean value of each event of each route, then imputing the missing values from local means. How to impute via local mean? That is, need to impute, conditional on the other values in the array. Need to read about imputation. Perhaps I should just write my own imputation, I think I could calculate that. Then apply hierarchical clustering to find out which.

# create wideform array
routes_wide <- 
  routes_raw %>% 
    rename(game_event = supermetroid_label) %>% 
    group_by(run_id) %>% 
    filter(realtime_end_ms == max(realtime_end_ms),
           game_event %in% game_event
           ) %>% 
    ungroup() %>% 
    distinct() %>% 
    # dplyr::group_by(run_id, game_event) %>%
    # dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
    # dplyr::filter(n > 1L) 
    pivot_wider(id_cols = "run_id", 
                names_from = "game_event", 
                values_from = "realtime_end_ms") 
#> Error in eval(expr, envir, enclos): object 'routes_raw' not found

library(caret)
#> Loading required package: lattice
#> 
#> Attaching package: 'caret'
#> The following object is masked from 'package:purrr':
#> 
#>     lift

trControl <- trainControl(method  = "cv",
                          number  = 5)


fit <- train(supermetroid_label ~ realtime_end_ms,
             method     = "knn",
             tuneGrid   = expand.grid(k = 1:10),
             trControl  = trControl,
             metric     = "Accuracy",
             data       = routes_raw)
#> Error in eval(expr, p): object 'routes_raw' not found

routes_wide <- 
routes %>% 
  select(route_id, runs) %>% 
  unnest(runs) %>% 
  left_join(sio_df %>% 
              filter(split %in% game_event) %>% 
              select(run_id,
                     realtime_end_ms, split) %>% 
              group_by(run_id, split) %>% 
              filter(realtime_end_ms==max(realtime_end_ms)), 
            by = "run_id") %>% 
  group_by(route_id, run_id) %>% 
  # 0 entries must be corrupt?
  filter(realtime_end_ms != 0) %>% 
  pivot_wider(names_from = "split", values_from = "realtime_end_ms") %>% 
  janitor::clean_names()
#> Error in eval(expr, envir, enclos): object 'routes' not found

sio_routes_wide <- routes_wide

usethis::use_data(sio_routes_wide)





library(caret)

preProcValues <-
  preProcess(routes_wide %>% ungroup() %>% select(-c(run_id, route_id)))
#> Error in eval(expr, envir, enclos): object 'routes_wide' not found

trControl <- trainControl(method  = "cv",
                          number  = 5)

# routes_wide %>% names() %>% paste(collapse ="+") %>%  cat()
fit <-
  train(
    route_id ~ morph_ball + bombs +
      charge_beam + varia_suit + speed_booster +
      wave_beam + grapple_beam + ice_beam + gravity_suit +
      space_jump + spring_ball +
      plasma_beam + screw_attack + spazer +
      hi_jump_boots + phantoon + draygon +
      ridley + kraid + x_ray + mother_brain + escape,
    method     = "knn",
    tuneGrid   = expand.grid(k = 1:10),
    trControl  = trControl,
    metric     = "Accuracy",
    data       = routes_wide
  )
#> Error in eval(expr, p): object 'routes_wide' not found


predict(preProcValues)
#> Error in eval(expr, envir, enclos): object 'preProcValues' not found

ice_beam <- 
routes_wide %>% 
  select(route_id, route_id, ice_beam) %>% 
  group_by(route_id) %>% 
  summarise(
    ice_beam = mean(ice_beam)
  ) %>% 
  ungroup() %>% 
  filter(!is.na(ice_beam)) 
#> Error in eval(expr, envir, enclos): object 'routes_wide' not found

clusters <- dist(ice_beam) %>% hclust()
#> Error in eval(expr, envir, enclos): object 'ice_beam' not found

plot(clusters, labels = ice_beam$route_id)
#> Error in eval(expr, envir, enclos): object 'clusters' not found

library(ggdendro)

ggdendrogram(clusters)
#> Error in eval(expr, envir, enclos): object 'clusters' not found


# fuckit write csv
write_csv(routes_wide, "dev/routes_wide.csv")

import numpy as np
from sklearn.impute import KNNImputer

routes_wide = pd.read_csv("dev/routes_wide.csv")
#> Error: NameError: name 'pd' is not defined
imputer = KNNImputer()

df_knn_imputed = pd.DataFrame(imputer.fit_transform(routes_wide.drop(columns = ['run_id', 'route_id'])))
#> Error: NameError: name 'pd' is not defined

# set up df of imputed data in R
x_event = py$df_knn_imputed
#> Error in eval(expr, envir, enclos): object 'py' not found

colnames(x_event) <- routes_wide %>% ungroup() %>%  select(-c(route_id, run_id)) %>% colnames()
#> Error in eval(expr, envir, enclos): object 'routes_wide' not found

routes_imputed = 
  routes_wide %>% select(route_id, run_id) %>% bind_cols(x_event)
#> Error in eval(expr, envir, enclos): object 'routes_wide' not found


# try using caret to do knn classification

routes_imputed %>% View()
#> Error in eval(expr, envir, enclos): object 'routes_imputed' not found

routes_scaled <- 
  routes_imputed %>%  
    ungroup() %>% 
    select(-c(route_id, run_id)) %>% 
    mutate(across(everything(), scale)) %>% 
  bind_cols(routes_imputed %>% select(route_id, run_id)) %>% 
  select(route_id, run_id, everything())
#> Error in eval(expr, envir, enclos): object 'routes_imputed' not found

# https://rpubs.com/pmtam/knn

# Run algorithms using 10-fold cross validation
trainControl <- trainControl(method="repeatedcv", number=10, repeats=3)
metric <- "Accuracy"

grid <- expand.grid(.k=seq(1,20,by=1))

fit.knn <- train(route_id  # ~ ice_beam,
                 ~ # run_id 
                   + morph_ball + bombs +
      charge_beam + varia_suit + speed_booster +
      wave_beam + grapple_beam + ice_beam + gravity_suit +
      space_jump + spring_ball +
      plasma_beam + screw_attack + spazer +
      hi_jump_boots + phantoon + draygon +
      ridley + kraid + x_ray + mother_brain + escape, 
      data=routes_scaled %>% select(-run_id), method="knn",
                 metric=metric ,trControl=trainControl, tuneGrid=grid)
#> Error in eval(expr, p): object 'routes_scaled' not found


knn.k1 <- fit.knn$bestTune # keep this Initial k for testing with knn() function in next section
#> Error in eval(expr, envir, enclos): object 'fit.knn' not found
print(fit.knn)
#> Error in eval(expr, envir, enclos): object 'fit.knn' not found