Skip to contents

This document updates splitsio df for the analysis

Get segment labels

Splits, or segments, measure at what time a game event happens, and how long a player took to get there from their previous segment.

Unique split id?

Although segments are chosen by a program, so far we’ve only found the player labels for each split or segment. Players can define their own segments and labels, too, so we don’t want to confuse those points with others.

On the other hand we don’t want to keep bomb and throw out all the bombs with the bathwater.

labels_dat <- 
  # pull columns for this analysis from splits data
  sio_splits_df %>% 
  select(player_name, game_event, name, segment_number, run_id, player_id) %>% 
  mutate(
    # create a lower-case label 
    label = tolower(game_event)
    )
labels_dat %>% head()
#>   player_name           game_event                 name segment_number run_id
#> 0  anatomecha        Morphing Ball        Morphing Ball              0   ato1
#> 1  anatomecha       First Missiles       First Missiles              1   ato1
#> 2  anatomecha                 Bomb                 Bomb              2   ato1
#> 3  anatomecha First Super Missiles First Super Missiles              3   ato1
#> 4  anatomecha          Charge Beam          Charge Beam              4   ato1
#> 5  anatomecha               Spazer               Spazer              5   ato1
#>   player_id                label
#> 0     89646        morphing ball
#> 1     89646       first missiles
#> 2     89646                 bomb
#> 3     89646 first super missiles
#> 4     89646          charge beam
#> 5     89646               spazer

Most common labels

label_counts <- 
labels_dat %>% 
  mutate(
    starts_with = str_sub(label, 1, 1)
  ) %>% 
  count(starts_with, label) %>% 
  arrange(starts_with, desc(n))
label_counts %>% 
  ungroup() %>% 
  select(-starts_with) %>%  
  arrange(desc(n)) %>%
  filter(n > 50)
#>                   label   n
#> 1            space jump 411
#> 2          screw attack 360
#> 3                 bombs 339
#> 4          gravity suit 326
#> 5              phantoon 294
#> 6           plasma beam 271
#> 7         speed booster 262
#> 8            varia suit 259
#> 9           spring ball 257
#> 10            wave beam 252
#> 11             ice beam 248
#> 12          charge beam 231
#> 13         grapple beam 207
#> 14               spazer 203
#> 15                x-ray 203
#> 16             golden 4 154
#> 17                varia 154
#> 18           morph ball 153
#> 19        ridley e-tank 143
#> 20               ridley 142
#> 21                   g4 119
#> 22      ridley's e-tank 115
#> 23          x-ray scope 115
#> 24              botwoon 113
#> 25        turtle e-tank 113
#> 26              grapple 112
#> 27          golden four 105
#> 28              gravity 105
#> 29              escape!  96
#> 30                kraid  94
#> 31         mother brain  93
#> 32              draygon  91
#> 33               escape  91
#> 34       kraid missiles  84
#> 35        hi-jump boots  82
#> 36            crocomire  72
#> 37       grappling beam  72
#> 38          bomb torizo  71
#> 39            ss supers  71
#> 40             any% pbs  70
#> 41               plasma  70
#> 42          crab supers  66
#> 43                  ice  63
#> 44                boots  62
#> 45         early supers  59
#> 46 watering hole supers  59
#> 47        golden torizo  57
#> 48  green hill missiles  57
#> 49         kraid e-tank  56
#> 50       etecoon supers  53

# write labels to sheet for labelling 
# https://docs.google.com/spreadsheets/d/1FRHsS7WQvVhI5_yP-twHCJW2YRJRbeKg4P42EY3Jzrk/edit#gid=1079532092


write_csv(label_counts, "data-raw/sio-label-counts.csv")

relabels <- googlesheets4::read_sheet("https://docs.google.com/spreadsheets/d/1FRHsS7WQvVhI5_yP-twHCJW2YRJRbeKg4P42EY3Jzrk/edit#gid=1079532092")

write_csv(relabels, "data-raw/sio-relabels.csv")

Add labels

relabels <- read_csv("../data-raw/sio-relabels.csv") %>% 
  select(label, supermetroid_label) %>% 
  distinct()
#> Rows: 2563 Columns: 4
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (3): starts_with, label, supermetroid_label
#> dbl (1): n
#> 
#>  Use `spec()` to retrieve the full column specification for this data.
#>  Specify the column types or set `show_col_types = FALSE` to quiet this message.

sio_relabelled <- 
  sio_splits_df %>% 
    mutate(
      label = tolower(game_event)
    ) %>% 
    left_join(relabels, by = "label") %>% 
    # replace label values
    mutate(label = if_else(!is.na(supermetroid_label), supermetroid_label, label)) %>% 
    ungroup() %>% 
    select(-game_event, -name) %>% 
    select(contains("label"), player_name, contains("realtime"), everything())

# there now, that's looking better
head(sio_relabelled)
#>                  label supermetroid_label player_name realtime_start_ms
#> 1           morph ball         morph ball  anatomecha                 0
#> 2       first missiles               <NA>  anatomecha            200835
#> 3                bombs              bombs  anatomecha            224926
#> 4 first super missiles               <NA>  anatomecha            358321
#> 5          charge beam        charge beam  anatomecha            589662
#> 6               spazer             spazer  anatomecha            709962
#>   realtime_duration_ms realtime_end_ms realtime_shortest_duration_ms
#> 1               200835          200835                        200835
#> 2                24091          224926                         23286
#> 3               133395          358321                        129371
#> 4               231341          589662                        227335
#> 5               120300          709962                        120299
#> 6                97876          807838                         95884
#>   realtime_gold segment_number                           segment_id run_id
#> 1          TRUE              0 69d90da3-1634-4af4-9096-419a32bedd0a   ato1
#> 2         FALSE              1 fad50319-d55b-460e-923a-c416f233daea   ato1
#> 3         FALSE              2 089daf0b-64c3-4eec-8902-022335b2b179   ato1
#> 4         FALSE              3 2a215dcd-2989-4939-8558-a7215904dfd1   ato1
#> 5         FALSE              4 e906991d-e919-47c1-a809-2b4da9b67461   ato1
#> 6         FALSE              5 10439232-43c7-4752-afb9-ee23782da4a9   ato1
#>   player_id
#> 1     89646
#> 2     89646
#> 3     89646
#> 4     89646
#> 5     89646
#> 6     89646
# check out counts now
sio_relabelled %>% 
  count(label, supermetroid_label) %>% 
  arrange(desc(n)) %>% 
  filter(n > 50) %>% 
  gt()
label supermetroid_label n
space jump space jump 450
varia suit varia suit 425
grapple beam grapple beam 405
screw attack screw attack 404
bombs bombs 399
x-ray x-ray 359
plasma beam plasma beam 342
gravity suit gravity suit 334
ice beam ice beam 324
phantoon phantoon 323
speed booster speed booster 313
spring ball spring ball 289
golden 4 golden 4 286
wave beam wave beam 283
charge beam charge beam 232
spazer spazer 218
hi-jump boots hi-jump boots 212
morph ball morph ball 208
ridley ridley 182
ridley e-tank NA 143
spore spawn supers spore spawn supers 136
draygon draygon 121
ridley's e-tank NA 115
kraid kraid 114
botwoon NA 113
turtle e-tank NA 113
golden four NA 105
gravity NA 105
crocomire crocomire 103
mother brain mother brain 99
escape! NA 96
first supers first supers 92
escape NA 91
kraid missiles NA 84
bomb torizo NA 71
any% pbs NA 70
crab supers NA 66
watering hole supers NA 59
golden torizo NA 57
green hill missiles NA 57
kraid e-tank NA 56
etecoon supers NA 53
sio_df <- sio_relabelled %>% select(-supermetroid_label) %>% 
  rename(game_event = label)

sio_df %>% head()
#>             game_event player_name realtime_start_ms realtime_duration_ms
#> 1           morph ball  anatomecha                 0               200835
#> 2       first missiles  anatomecha            200835                24091
#> 3                bombs  anatomecha            224926               133395
#> 4 first super missiles  anatomecha            358321               231341
#> 5          charge beam  anatomecha            589662               120300
#> 6               spazer  anatomecha            709962                97876
#>   realtime_end_ms realtime_shortest_duration_ms realtime_gold segment_number
#> 1          200835                        200835          TRUE              0
#> 2          224926                         23286         FALSE              1
#> 3          358321                        129371         FALSE              2
#> 4          589662                        227335         FALSE              3
#> 5          709962                        120299         FALSE              4
#> 6          807838                         95884         FALSE              5
#>                             segment_id run_id player_id
#> 1 69d90da3-1634-4af4-9096-419a32bedd0a   ato1     89646
#> 2 fad50319-d55b-460e-923a-c416f233daea   ato1     89646
#> 3 089daf0b-64c3-4eec-8902-022335b2b179   ato1     89646
#> 4 2a215dcd-2989-4939-8558-a7215904dfd1   ato1     89646
#> 5 e906991d-e919-47c1-a809-2b4da9b67461   ato1     89646
#> 6 10439232-43c7-4752-afb9-ee23782da4a9   ato1     89646
# not run
usethis::use_data(sio_df, overwrite=TRUE)

Selected splits

Select most common splits used by players.

sio_df %>%
  group_by(game_event) %>% 
  summarise(
    n_runners = n_distinct(player_id)
  ) %>% 
  filter(n_runners < 20) %>%  
  ggplot(aes(x=n_runners)) +
  geom_histogram(fill=sm_col_h$label) +
  theme_sm() +
  labs(title = "Counts of runners for splits")
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

common_splits <-
  # get standard order - recorded by at least 30 runners
  sio_df %>%
  group_by(game_event) %>% 
  summarise(
    n_runners = n_distinct(player_id)
  ) %>% 
  filter(n_runners >= 30) %>% 
  arrange(desc(n_runners)) 

common_splits
#> # A tibble: 19 × 2
#>    game_event    n_runners
#>    <chr>             <int>
#>  1 varia suit           80
#>  2 bombs                75
#>  3 space jump           75
#>  4 grapple beam         72
#>  5 screw attack         68
#>  6 x-ray                68
#>  7 ice beam             63
#>  8 plasma beam          58
#>  9 speed booster        58
#> 10 phantoon             56
#> 11 gravity suit         55
#> 12 golden 4             54
#> 13 wave beam            49
#> 14 spring ball          45
#> 15 charge beam          37
#> 16 morph ball           37
#> 17 hi-jump boots        35
#> 18 spazer               33
#> 19 ridley               30
# not run
usethis::use_data(common_splits, overwrite=TRUE)

What routes do people take?

Oh this code is all borked and from early experimental phase from here. Waiting to copy-paste, etc.

routes_raw <- 
sio_df %>% 
  select(supermetroid_label, player_name, split_order = segment_number, realtime_end_ms, run_id, player_id) %>% 
  # get just the universal splits
  filter(!is.na(supermetroid_label))
#> Error in `select()`:
#> ! Can't subset columns that don't exist.
#>  Column `supermetroid_label` doesn't exist.
splits_orders <- 
routes_raw %>% 
  select(supermetroid_label, split_order, run_id)
#> Error in eval(expr, envir, enclos): object 'routes_raw' not found

nested_splits <- 
splits_orders %>% 
  arrange(run_id, split_order) %>% 
  select(-split_order) %>% 
  group_by(run_id) %>% 
  nest(splits = supermetroid_label) %>% 
  ungroup() %>% 
  group_by(splits) %>% 
  nest(runs = run_id) %>% 
  ungroup() %>% 
  mutate(
    route_length = map_int(splits, nrow),
    route_id = str_c("route ", 1:n())
  ) %>% 
  arrange(desc(route_length))
#> Error in eval(expr, envir, enclos): object 'splits_orders' not found
  
nested_splits %>%
  head() %>% 
  gt()
#> Error in eval(expr, envir, enclos): object 'nested_splits' not found
routes <- 
nested_splits %>%
  # head() %>% 
  mutate(event = map(splits, \(df) left_join(df, common_splits, by = "supermetroid_label")),
         route_anatomecha = map(event, \(df) df %>% filter(!is.na(split_anatomecha)) %>%  pull(split_anatomecha)),
         route_standard = map_lgl(route_anatomecha,
                                  \(x) all(sort(x) == x))
         )
#> Error in eval(expr, envir, enclos): object 'nested_splits' not found

time vs cumulative time by splits

# dashboard this for y axis
# dashboard this for x axis, too selector tool with tick boxes, select all, clear

# plot <- 
  sio_df %>% 
  left_join(common_splits, by = "supermetroid_label") %>% 
  filter(!is.na(supermetroid_label)) %>% 
  mutate(split_anatomecha = as_factor(split_anatomecha),
         t_h = realtime_end_ms/1000/60
         ) %>% 
  ggplot(aes(x = split_anatomecha, y = t_h, group=run_id)) +
  geom_line(colour=sm_cols$orange, alpha = 0.2) +
  geom_point(colour=sm_cols$orange, alpha = 0.2) +
  theme_sm(base_size = 30) +
  labs(
    title = "Players take different routes",
    subtitle = "Real time completion of each game event would be 
      always increasing if all players took the same route as anatomecha" %>% str_wrap(60),
    x = "Game event (loot, boss, etc.)" %>% str_wrap(12),
    y = "Real time completion (minutes)" %>% str_wrap(20),
    caption = "splits.io"
  ) +
  theme(
    axis.text.x = element_text(angle=90)
  ) +
  scale_x_discrete(labels=c("start", game_event)) +
  facet_grid(route_standard ~. ) 
#> Error in `left_join()`:
#> ! Join columns in `x` must be present in the data.
#>  Problem with `supermetroid_label`.

# ggplotly(plot)
# selected_split <- "ice beam" 
selected_split <- "ice beam"# sample(game_event, 1)
rank <- c(1,25)

#xlim <- c(0.8, 1.2)

# +
#   xlim(xlim[1], xlim[2])

Towards the best route

identifying clusters

Try knn

Wide-form array: - columns as events - rows as routes - values as mean event time

I think this should work on individual observations.

  1. Get an array of segments
  2. Set an n-fold cross validation
  3. Evaluated the accuracy of the KNn classifer with different values of k by cross validation
# get an array of segments

identifying clusters

Try taking the mean value of each event of each route, then imputing the missing values from local means. How to impute via local mean? That is, need to impute, conditional on the other values in the array. Need to read about imputation. Perhaps I should just write my own imputation, I think I could calculate that. Then apply hierarchical clustering to find out which.

# create wideform array
routes_wide <- 
  routes_raw %>% 
    rename(game_event = supermetroid_label) %>% 
    group_by(run_id) %>% 
    filter(realtime_end_ms == max(realtime_end_ms),
           game_event %in% game_event
           ) %>% 
    ungroup() %>% 
    distinct() %>% 
    # dplyr::group_by(run_id, game_event) %>%
    # dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
    # dplyr::filter(n > 1L) 
    pivot_wider(id_cols = "run_id", 
                names_from = "game_event", 
                values_from = "realtime_end_ms") 
#> Error in eval(expr, envir, enclos): object 'routes_raw' not found
library(caret)
#> Loading required package: lattice
#> 
#> Attaching package: 'caret'
#> The following object is masked from 'package:purrr':
#> 
#>     lift

trControl <- trainControl(method  = "cv",
                          number  = 5)


fit <- train(supermetroid_label ~ realtime_end_ms,
             method     = "knn",
             tuneGrid   = expand.grid(k = 1:10),
             trControl  = trControl,
             metric     = "Accuracy",
             data       = routes_raw)
#> Error in eval(expr, p): object 'routes_raw' not found
routes_wide <- 
routes %>% 
  select(route_id, runs) %>% 
  unnest(runs) %>% 
  left_join(sio_df %>% 
              filter(split %in% game_event) %>% 
              select(run_id,
                     realtime_end_ms, split) %>% 
              group_by(run_id, split) %>% 
              filter(realtime_end_ms==max(realtime_end_ms)), 
            by = "run_id") %>% 
  group_by(route_id, run_id) %>% 
  # 0 entries must be corrupt?
  filter(realtime_end_ms != 0) %>% 
  pivot_wider(names_from = "split", values_from = "realtime_end_ms") %>% 
  janitor::clean_names()
#> Error in eval(expr, envir, enclos): object 'routes' not found
sio_routes_wide <- routes_wide

usethis::use_data(sio_routes_wide)




library(caret)

preProcValues <-
  preProcess(routes_wide %>% ungroup() %>% select(-c(run_id, route_id)))
#> Error in eval(expr, envir, enclos): object 'routes_wide' not found

trControl <- trainControl(method  = "cv",
                          number  = 5)

# routes_wide %>% names() %>% paste(collapse ="+") %>%  cat()
fit <-
  train(
    route_id ~ morph_ball + bombs +
      charge_beam + varia_suit + speed_booster +
      wave_beam + grapple_beam + ice_beam + gravity_suit +
      space_jump + spring_ball +
      plasma_beam + screw_attack + spazer +
      hi_jump_boots + phantoon + draygon +
      ridley + kraid + x_ray + mother_brain + escape,
    method     = "knn",
    tuneGrid   = expand.grid(k = 1:10),
    trControl  = trControl,
    metric     = "Accuracy",
    data       = routes_wide
  )
#> Error in eval(expr, p): object 'routes_wide' not found


predict(preProcValues)
#> Error in eval(expr, envir, enclos): object 'preProcValues' not found
ice_beam <- 
routes_wide %>% 
  select(route_id, route_id, ice_beam) %>% 
  group_by(route_id) %>% 
  summarise(
    ice_beam = mean(ice_beam)
  ) %>% 
  ungroup() %>% 
  filter(!is.na(ice_beam)) 
#> Error in eval(expr, envir, enclos): object 'routes_wide' not found

clusters <- dist(ice_beam) %>% hclust()
#> Error in eval(expr, envir, enclos): object 'ice_beam' not found

plot(clusters, labels = ice_beam$route_id)
#> Error in eval(expr, envir, enclos): object 'clusters' not found

library(ggdendro)

ggdendrogram(clusters)
#> Error in eval(expr, envir, enclos): object 'clusters' not found

# fuckit write csv
write_csv(routes_wide, "dev/routes_wide.csv")
import numpy as np
from sklearn.impute import KNNImputer

routes_wide = pd.read_csv("dev/routes_wide.csv")
#> Error: NameError: name 'pd' is not defined
imputer = KNNImputer()

df_knn_imputed = pd.DataFrame(imputer.fit_transform(routes_wide.drop(columns = ['run_id', 'route_id'])))
#> Error: NameError: name 'pd' is not defined
# set up df of imputed data in R
x_event = py$df_knn_imputed
#> Error in eval(expr, envir, enclos): object 'py' not found

colnames(x_event) <- routes_wide %>% ungroup() %>%  select(-c(route_id, run_id)) %>% colnames()
#> Error in eval(expr, envir, enclos): object 'routes_wide' not found

routes_imputed = 
  routes_wide %>% select(route_id, run_id) %>% bind_cols(x_event)
#> Error in eval(expr, envir, enclos): object 'routes_wide' not found

# try using caret to do knn classification

routes_imputed %>% View()
#> Error in eval(expr, envir, enclos): object 'routes_imputed' not found

routes_scaled <- 
  routes_imputed %>%  
    ungroup() %>% 
    select(-c(route_id, run_id)) %>% 
    mutate(across(everything(), scale)) %>% 
  bind_cols(routes_imputed %>% select(route_id, run_id)) %>% 
  select(route_id, run_id, everything())
#> Error in eval(expr, envir, enclos): object 'routes_imputed' not found

# https://rpubs.com/pmtam/knn

# Run algorithms using 10-fold cross validation
trainControl <- trainControl(method="repeatedcv", number=10, repeats=3)
metric <- "Accuracy"

grid <- expand.grid(.k=seq(1,20,by=1))

fit.knn <- train(route_id  # ~ ice_beam,
                 ~ # run_id 
                   + morph_ball + bombs +
      charge_beam + varia_suit + speed_booster +
      wave_beam + grapple_beam + ice_beam + gravity_suit +
      space_jump + spring_ball +
      plasma_beam + screw_attack + spazer +
      hi_jump_boots + phantoon + draygon +
      ridley + kraid + x_ray + mother_brain + escape, 
      data=routes_scaled %>% select(-run_id), method="knn",
                 metric=metric ,trControl=trainControl, tuneGrid=grid)
#> Error in eval(expr, p): object 'routes_scaled' not found


knn.k1 <- fit.knn$bestTune # keep this Initial k for testing with knn() function in next section
#> Error in eval(expr, envir, enclos): object 'fit.knn' not found
print(fit.knn)
#> Error in eval(expr, envir, enclos): object 'fit.knn' not found