Skip to contents
import pandas as pd
import splitsio
import json

# try to fix the issue in src, possibly ask Anthony

splits.io data

Load runs and runners

The `splitsio` module returns its own object types; here I convert the runs to a list of dictionaries and save them locally, so I don’t have to call the API every time I knit the HTML.

# this chunk is not evaluated 
# to minimise api calls

# Fetch the 100% category of Super Metroid (category id "279"),
# passing historic=True to the splitsio client.
sio_cat = splitsio.Category.from_id("279", historic=True)

# Pull both collections that hang off the category: its runners and its runs.
sio_runners = sio_cat.runners()
sio_runs = sio_cat.runs()

# Interactive sanity checks (display only, nothing is assigned):
# the type of each object, then the first and last element of each collection.
type(sio_cat)
type(sio_runners)
type(sio_runs)

sio_runners[0]
sio_runners[-1]

sio_runs[0]
sio_runs[-1]


# Convert the run objects to plain dictionaries so they can be serialised.
# Iterate over the runs directly instead of indexing through range(len(...)).
sio_run_dicts = [run.to_dict() for run in sio_runs]

# each element is a dictionary; list the keys of the first one
sio_run_dicts[0].keys()

# get the id of the first run
sio_run_dicts[0]['id']

# get the realtime duration (in ms) of the first run
sio_run_dicts[0]['realtime_duration_ms']

# Dump every run dictionary, then pretty-print only the first three so the
# structure is readable.
print(sio_run_dicts)
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(sio_run_dicts[0:3])

# Show the attributes stored on the category object.
sio_cat.__dict__

# Walk the instance dictionary attribute by attribute. Attribute names are
# plain strings and have no __dict__ of their own, so print each
# name/value pair instead of asking the key for its __dict__.
# NOTE(review): presumably the intent was to inspect each attribute - confirm.
for attr_name, attr_value in vars(sio_cat).items():
  print(attr_name, attr_value)

# Cache the run dictionaries on disk as JSON indented by four spaces.
file_path = "data-raw/sio_cat.json"
with open(file_path, "w") as json_out:
  json.dump(sio_run_dicts, json_out, indent=4)

Run dataframe

Objective: to wrangle a data frame with run data

| sio_run_df | description | splitsio |
|---|---|---|
| run_id | splits.io id | id |
| date | timestamp of run upload | ? 'created_at', 'parsed_at', 'updated_at' — we’ll use 'updated_at' for now, but should check the splitsio docs for which one is right |
| run_time | total time of run in s or ms | realtime_duration_ms |
| rank | “historical” if from previous record (nb only applies to speedrun.com); otherwise rank as int, list column | ? |
| src_player_id | speedrun.com player id | srdc_id |

Try with list of dictionaries

# Reload the locally cached runs rather than calling the API again.
with open("../data-raw/sio_cat.json", "r") as cache_file:
  sio_run_dicts = json.loads(cache_file.read())
# player data: id of the first runner attached to the first run
pd.json_normalize(sio_run_dicts[0]['runners'])['id'][0]
#> '29900'
sio_run_player_index = 10 # len(sio_run_dicts)

# One small frame of runners per run, each tagged with the id of its run.
# Iterating over the dictionaries directly avoids indexing the list twice
# per element through range(len(...)).
run_runners_list = [
  pd.json_normalize(run['runners']).assign(run_id=run['id'])
  for run in sio_run_dicts
]

# Stack the per-run frames; each frame keeps its own row labels, so the
# labels repeat across runs.
run_runners_concat = pd.concat(run_runners_list)

# Keep only the identifying columns and give them player-specific names.
run_runners = run_runners_concat[['name', 'run_id', 'id']].rename(
  columns={'id': 'player_id', 'name': 'player_name'}
)

sio_run_dicts[0].keys()
#> dict_keys(['id', 'srdc_id', 'realtime_duration_ms', 'realtime_sum_of_best_ms', 'gametime_duration_ms', 'gametime_sum_of_best_ms', 'default_timing', 'program', 'attempts', 'image_url', 'parsed_at', 'created_at', 'updated_at', 'video_url', 'game', 'category', 'runners', 'segments', 'histories'])
# create a dataframe with one row per run

# Output column name -> key to read from each run dictionary.
run_field_map = {
  'run_id': 'id',
  't_ms': 'realtime_duration_ms',
  'date': 'updated_at',
  'image_url': 'image_url',
  'video_url': 'video_url',
}

# Build every column by iterating over the run dictionaries directly,
# instead of repeating a range(len(...)) index loop once per column.
sio_runs_df = pd.DataFrame({
  column: [run[key] for run in sio_run_dicts]
  for column, key in run_field_map.items()
})


sio_runs_df.head()

#>   run_id     t_ms  ...                        image_url video_url
#> 0   awdz  4473112  ...  https://i.imgur.com/zSvYVbK.png      None
#> 1   awcd  6516886  ...                             None      None
#> 2   avqg  4965711  ...                             None      None
#> 3   avgw  6885726  ...                             None      None
#> 4   av92  4858865  ...                             None      None
#> 
#> [5 rows x 5 columns]
# need to merge with run runners
run_runners.head()
#>   player_name run_id player_id
#> 0    CScottyW   awdz     29900
#> 0  juniorr300   awcd     89051
#> 0        Hatz   avqg     91587
#> 0  anatomecha   avgw     89646
#> 0     eholden   av92     37661
# Left-join the runner details onto the run table by run id, then renumber
# the rows from zero.
run_runners_df = pd.merge(
  sio_runs_df, run_runners, how='left', on="run_id"
).reset_index(drop=True)

# run and runner data
run_runners_df.head()
#>   run_id     t_ms                      date  ... video_url player_name player_id
#> 0   awdz  4473112  2023-06-27T04:24:02.434Z  ...      None    CScottyW     29900
#> 1   awcd  6516886  2023-06-26T20:12:45.152Z  ...      None  juniorr300     89051
#> 2   avqg  4965711  2023-06-21T20:55:31.169Z  ...      None        Hatz     91587
#> 3   avgw  6885726  2023-06-19T19:13:58.050Z  ...      None  anatomecha     89646
#> 4   av92  4858865  2023-06-17T23:19:14.463Z  ...      None     eholden     37661
#> 
#> [5 rows x 7 columns]

create segments dataframe

| segment_df | description |
|---|---|
| run_id | unique identifier of run |
| segment_id | unique identifier of segment |
| game_event | description of split |
| t_s | time in seconds, measured to millisecond precision |
# first level of segments: the 'segments' entry of a run is a list ...
type(sio_run_dicts[0]['segments'])
#> <class 'list'>
# ... and each element of that list is a dictionary
type(sio_run_dicts[0]['segments'][0])
#> <class 'dict'>

# inspect all segments of the first run
sio_run_dicts[0]['segments']

# a single segment
#> [{'id': 'ff4b6370-f7aa-4d46-a9d2-90b1c286b050', 'name': 'Taco Tank', 'display_name': 'Taco Tank', 'segment_number': 0, 'realtime_start_ms': 0, 'realtime_duration_ms': 285053, 'realtime_end_ms': 285053, 'realtime_shortest_duration_ms': 282335, 'realtime_gold': False, 'realtime_skipped': False, 'realtime_reduced': False, 'gametime_start_ms': 0, 'gametime_duration_ms': 0, 'gametime_end_ms': 0, 'gametime_shortest_duration_ms': None, 'gametime_gold': False, 'gametime_skipped': True, 'gametime_reduced': False, 'histories': None}, {'id': 'e34dcf39-1394-4152-a11e-ee1b77e94754', 'name': 'X-Ray', 'display_name': 'X-Ray', 'segment_number': 1, 'realtime_start_ms': 285053, 'realtime_duration_ms': 362295, 'realtime_end_ms': 647348, 'realtime_shortest_duration_ms': 356183, 'realtime_gold': False, 'realtime_skipped': False, 'realtime_reduced': False, 'gametime_start_ms': 0, 'gametime_duration_ms': 0, 'gametime_end_ms': 0, 'gametime_shortest_duration_ms': None, 'gametime_gold': False, 'gametime_skipped': True, 'gametime_reduced': False, 'histories': None}, {'id': '2248d379-b4b8-45d7-b7ae-fba99580035e', 'name': 'Grapple', 'display_name': 'Grapple', 'segment_number': 2, 'realtime_start_ms': 647348, 'realtime_duration_ms': 346896, 'realtime_end_ms': 994244, 'realtime_shortest_duration_ms': 341495, 'realtime_gold': False, 'realtime_skipped': False, 'realtime_reduced': False, 'gametime_start_ms': 0, 'gametime_duration_ms': 0, 'gametime_end_ms': 0, 'gametime_shortest_duration_ms': None, 'gametime_gold': False, 'gametime_skipped': True, 'gametime_reduced': False, 'histories': None}, {'id': '63fdf981-7762-4f42-998e-af5c81fad0de', 'name': 'Phan (f/m)', 'display_name': 'Phan (f/m)', 'segment_number': 3, 'realtime_start_ms': 994244, 'realtime_duration_ms': 298478, 'realtime_end_ms': 1292722, 'realtime_shortest_duration_ms': 289674, 'realtime_gold': False, 'realtime_skipped': False, 'realtime_reduced': False, 'gametime_start_ms': 0, 'gametime_duration_ms': 0, 'gametime_end_ms': 0, 
'gametime_shortest_duration_ms': None, 'gametime_gold': False, 'gametime_skipped': True, 'gametime_reduced': False, 'histories': None}, {'id': 'd31efcb8-a420-4d7f-aa11-4688b4a6741f', 'name': '100 Fishing Quests', 'display_name': '100 Fishing Quests', 'segment_number': 4, 'realtime_start_ms': 1292722, 'realtime_duration_ms': 316337, 'realtime_end_ms': 1609059, 'realtime_shortest_duration_ms': 316279, 'realtime_gold': False, 'realtime_skipped': False, 'realtime_reduced': False, 'gametime_start_ms': 0, 'gametime_duration_ms': 0, 'gametime_end_ms': 0, 'gametime_shortest_duration_ms': None, 'gametime_gold': False, 'gametime_skipped': True, 'gametime_reduced': False, 'histories': None}, {'id': '3202c9ab-05f6-49fa-9e85-5accf5bd2dfd', 'name': 'Exit Tourian', 'display_name': 'Exit Tourian', 'segment_number': 5, 'realtime_start_ms': 1609059, 'realtime_duration_ms': 392693, 'realtime_end_ms': 2001752, 'realtime_shortest_duration_ms': 375931, 'realtime_gold': False, 'realtime_skipped': False, 'realtime_reduced': False, 'gametime_start_ms': 0, 'gametime_duration_ms': 0, 'gametime_end_ms': 0, 'gametime_shortest_duration_ms': None, 'gametime_gold': False, 'gametime_skipped': True, 'gametime_reduced': False, 'histories': None}, {'id': '28c363db-ad09-47b5-9dc6-fa12f96eb699', 'name': 'Space Jump', 'display_name': 'Space Jump', 'segment_number': 6, 'realtime_start_ms': 2001752, 'realtime_duration_ms': 407696, 'realtime_end_ms': 2409448, 'realtime_shortest_duration_ms': 394762, 'realtime_gold': False, 'realtime_skipped': False, 'realtime_reduced': False, 'gametime_start_ms': 0, 'gametime_duration_ms': 0, 'gametime_end_ms': 0, 'gametime_shortest_duration_ms': None, 'gametime_gold': False, 'gametime_skipped': True, 'gametime_reduced': False, 'histories': None}, {'id': 'ddf8222e-414d-4aa9-9b3d-e5d1954765d6', 'name': 'Spring Ball', 'display_name': 'Spring Ball', 'segment_number': 7, 'realtime_start_ms': 2409448, 'realtime_duration_ms': 246277, 'realtime_end_ms': 2655725, 
'realtime_shortest_duration_ms': 245998, 'realtime_gold': False, 'realtime_skipped': False, 'realtime_reduced': False, 'gametime_start_ms': 0, 'gametime_duration_ms': 0, 'gametime_end_ms': 0, 'gametime_shortest_duration_ms': None, 'gametime_gold': False, 'gametime_skipped': True, 'gametime_reduced': False, 'histories': None}, {'id': '9a98d829-f21c-43db-9b5a-5b455428c653', 'name': 'Ice', 'display_name': 'Ice', 'segment_number': 8, 'realtime_start_ms': 2655725, 'realtime_duration_ms': 392564, 'realtime_end_ms': 3048289, 'realtime_shortest_duration_ms': 389927, 'realtime_gold': False, 'realtime_skipped': False, 'realtime_reduced': False, 'gametime_start_ms': 0, 'gametime_duration_ms': 0, 'gametime_end_ms': 0, 'gametime_shortest_duration_ms': None, 'gametime_gold': False, 'gametime_skipped': True, 'gametime_reduced': False, 'histories': None}, {'id': 'a05ee907-b9d1-407f-9466-59fc48c123bf', 'name': 'The Riddler', 'display_name': 'The Riddler', 'segment_number': 9, 'realtime_start_ms': 3048289, 'realtime_duration_ms': 197030, 'realtime_end_ms': 3245319, 'realtime_shortest_duration_ms': 182305, 'realtime_gold': False, 'realtime_skipped': False, 'realtime_reduced': False, 'gametime_start_ms': 0, 'gametime_duration_ms': 0, 'gametime_end_ms': 0, 'gametime_shortest_duration_ms': None, 'gametime_gold': False, 'gametime_skipped': True, 'gametime_reduced': False, 'histories': None}, {'id': '1f4d582d-7713-4476-bdc9-cfd717f7eddc', 'name': 'Grapple (Reprise)', 'display_name': 'Grapple (Reprise)', 'segment_number': 10, 'realtime_start_ms': 3245319, 'realtime_duration_ms': 387838, 'realtime_end_ms': 3633157, 'realtime_shortest_duration_ms': 383449, 'realtime_gold': False, 'realtime_skipped': False, 'realtime_reduced': False, 'gametime_start_ms': 0, 'gametime_duration_ms': 0, 'gametime_end_ms': 0, 'gametime_shortest_duration_ms': None, 'gametime_gold': False, 'gametime_skipped': True, 'gametime_reduced': False, 'histories': None}, {'id': '80f0e660-0058-4b06-9d62-e75ec9a57b08', 'name': 
'Make a Bet', 'display_name': 'Make a Bet', 'segment_number': 11, 'realtime_start_ms': 3633157, 'realtime_duration_ms': 433039, 'realtime_end_ms': 4066196, 'realtime_shortest_duration_ms': 424170, 'realtime_gold': False, 'realtime_skipped': False, 'realtime_reduced': False, 'gametime_start_ms': 0, 'gametime_duration_ms': 0, 'gametime_end_ms': 0, 'gametime_shortest_duration_ms': None, 'gametime_gold': False, 'gametime_skipped': True, 'gametime_reduced': False, 'histories': None}, {'id': '8139d4e1-2d1b-4237-a528-440ba5c93002', 'name': 'Michael', 'display_name': 'Michael', 'segment_number': 12, 'realtime_start_ms': 4066196, 'realtime_duration_ms': 406916, 'realtime_end_ms': 4473112, 'realtime_shortest_duration_ms': 396203, 'realtime_gold': False, 'realtime_skipped': False, 'realtime_reduced': False, 'gametime_start_ms': 0, 'gametime_duration_ms': 0, 'gametime_end_ms': 0, 'gametime_shortest_duration_ms': None, 'gametime_gold': False, 'gametime_skipped': True, 'gametime_reduced': False, 'histories': None}]
# keys available on a single segment dictionary
sio_run_dicts[0]['segments'][0].keys()
#> dict_keys(['id', 'name', 'display_name', 'segment_number', 'realtime_start_ms', 'realtime_duration_ms', 'realtime_end_ms', 'realtime_shortest_duration_ms', 'realtime_gold', 'realtime_skipped', 'realtime_reduced', 'gametime_start_ms', 'gametime_duration_ms', 'gametime_end_ms', 'gametime_shortest_duration_ms', 'gametime_gold', 'gametime_skipped', 'gametime_reduced', 'histories'])

# convert to df
# I think this is a dictionary of single-element entries, so can be flattened
# how to efficiently check this?
# preview the segments of the first run as a data frame
pd.DataFrame(sio_run_dicts[0]['segments']).head()

# List of dataframes of segments for each run
#>                                      id  ... histories
#> 0  ff4b6370-f7aa-4d46-a9d2-90b1c286b050  ...      None
#> 1  e34dcf39-1394-4152-a11e-ee1b77e94754  ...      None
#> 2  2248d379-b4b8-45d7-b7ae-fba99580035e  ...      None
#> 3  63fdf981-7762-4f42-998e-af5c81fad0de  ...      None
#> 4  d31efcb8-a420-4d7f-aa11-4688b4a6741f  ...      None
#> 
#> [5 rows x 19 columns]
# One data frame of segments per run, tagged with the id of the run it
# belongs to. Iterating over the run dictionaries directly replaces the
# range(len(...)) indexing.
segments_list = [
  pd.DataFrame(run['segments']).assign(run_id=run['id'])
  for run in sio_run_dicts
]

# Stack all runs into one long table and renumber the rows from zero.
segment_raw = pd.concat(segments_list).reset_index(drop=True)

segment_raw.columns
#> Index(['id', 'name', 'display_name', 'segment_number', 'realtime_start_ms',
#>        'realtime_duration_ms', 'realtime_end_ms',
#>        'realtime_shortest_duration_ms', 'realtime_gold', 'realtime_skipped',
#>        'realtime_reduced', 'gametime_start_ms', 'gametime_duration_ms',
#>        'gametime_end_ms', 'gametime_shortest_duration_ms', 'gametime_gold',
#>        'gametime_skipped', 'gametime_reduced', 'histories', 'run_id'],
#>       dtype='object')
segment_raw.shape
#> (18458, 20)
segment_raw.head()

#>                                      id                name  ... histories  run_id
#> 0  ff4b6370-f7aa-4d46-a9d2-90b1c286b050           Taco Tank  ...      None    awdz
#> 1  e34dcf39-1394-4152-a11e-ee1b77e94754               X-Ray  ...      None    awdz
#> 2  2248d379-b4b8-45d7-b7ae-fba99580035e             Grapple  ...      None    awdz
#> 3  63fdf981-7762-4f42-998e-af5c81fad0de          Phan (f/m)  ...      None    awdz
#> 4  d31efcb8-a420-4d7f-aa11-4688b4a6741f  100 Fishing Quests  ...      None    awdz
#> 
#> [5 rows x 20 columns]

segment_raw.columns
#> Index(['id', 'name', 'display_name', 'segment_number', 'realtime_start_ms',
#>        'realtime_duration_ms', 'realtime_end_ms',
#>        'realtime_shortest_duration_ms', 'realtime_gold', 'realtime_skipped',
#>        'realtime_reduced', 'gametime_start_ms', 'gametime_duration_ms',
#>        'gametime_end_ms', 'gametime_shortest_duration_ms', 'gametime_gold',
#>        'gametime_skipped', 'gametime_reduced', 'histories', 'run_id'],
#>       dtype='object')
sio_runs_df.columns
#> Index(['run_id', 't_ms', 'date', 'image_url', 'video_url'], dtype='object')

# Attach the run-level fields (t_ms, date, image_url, video_url) to every
# segment row. A left join keeps every segment, including any whose run_id
# has no match in sio_runs_df.
# No column is renamed here: display_name holds the segment's label
# (e.g. 'Taco Tank'), not a player name.
sio_segments_df = segment_raw.merge(sio_runs_df, on="run_id", how="left")

sio_segments_df.shape
sio_segments_df.columns
sio_segments_df.head()

# tidy it up in R before writing it

# Bring the Python data frames into R through the `py` object.
sio_segments <- py$segment_raw
# Use the merged run + runner table: it is the one that carries t_ms, date,
# image_url and video_url, which the rename()/select() below rely on
# (run_runners on its own only has player_name, run_id and player_id).
sio_runs <- py$run_runners_df

# Add the runner and total run time to every segment row.
sio_seg_runs <- left_join(sio_segments %>% distinct(), 
                          sio_runs %>% distinct() %>% 
                            rename(total_t_ms = t_ms) %>% 
                            select(-video_url, -image_url, -date)
                            , by = "run_id")

colnames(sio_seg_runs)

# List any (run, segment) pair that occurs more than once after the join.
sio_seg_runs %>% 
  count(run_id, id) %>% 
  arrange(desc(n)) %>% 
  filter(n > 1) 

# Build the tidy splits table from the joined data, so that player_name and
# player_id (which come from the runner table) are actually available.
# The segment's own `name` column is left untouched; it is the split label,
# not a player name.
sio_splits_df <- 
  sio_seg_runs %>% 
    select(
      player_name, 
      game_event = display_name, 
      segment_number, 
      realtime_start_ms, 
      segment_id = id,
      run_id,
      player_id,
      everything()) %>% 
    select(-contains("gametime"),
           -contains("reduced"),
           -contains("skipped"), -histories) %>% 
    mutate(player_id = as.character(player_id))

head(sio_splits_df)
# write data - this chunk is not evaluated on knit
usethis::use_data(sio_splits_df, overwrite = TRUE)