src • supermetroid

# most chunks in this vignette are not evaluated because I'm not sure how to
# save nested dictionaries yet, and calling the api borks in knitting but
# not in console.

# load R analysis package
library(supermetroid)

# other tools
library(tidyverse)
# so R can talk to Python
library(reticulate)
# for html tables
library(gt)

speedrun.com

Speedrun.com is (according to the two speed runners I’ve spoken to) the canonical leaderboard for speed runners on the internet. Because it is believed to hold the most complete set of speed runs (I’d like to check this with data), the canonical ranking of players amongst the community is their Speedrun.com ranking.

# Load the `src_df` dataset.
data(src_df)

# Draw the raincloud plot of runs.
# NOTE(review): data() above loads `src_df`, but the plot is drawn from
# `src_run_df` -- confirm which dataset is intended here.
all_run_raincloud(src_run_df)

The code below is how these data were extracted and cleaned for the visualisation above.

# Render the first rows of src_df as an html table.
gt(head(src_df))

rank	player_name	date	t_human	country	run_id	t_s	src_user_guest	uri_player	api_call
1	ShinyZeni	2023-06-20	PT1H12M47S	United States	ywrjqd9m	4367.000	user	https://www.speedrun.com/api/v1/users/zxzno3ex	zxzno3ex
2	zoast	2021-02-24	PT1H12M55S	Palau	yo75d4dm	4375.000	user	https://www.speedrun.com/api/v1/users/18v6k4nx	18v6k4nx
3	Behemoth87	2021-12-08	PT1H12M55.930S	England	m36d0q6m	4375.930	user	https://www.speedrun.com/api/v1/users/zxz2wy4x	zxz2wy4x
4	Gebbu	2023-05-08	PT1H13M12S	Norway	m3qo724y	4392.000	user	https://www.speedrun.com/api/v1/users/xk49m26j	xk49m26j
5	Static_Shock	2023-06-10	PT1H13M58.367S	Brazil	m3qrx76y	4438.367	user	https://www.speedrun.com/api/v1/users/x35ve3kj	x35ve3kj
6	CScottyW	2023-06-27	PT1H14M18S	United States	zpw9px8y	4458.000	user	https://www.speedrun.com/api/v1/users/18q3n0dj	18q3n0dj

api

There is a Python module, `srcomapi`, for accessing the speedrun.com API.

# load Python modules 
import srcomapi, srcomapi.datatypes as dt
import pandas as pd # for wrangling into df
import time
import json
import math

# Tak, Anthony :)
import pprint
pp = pprint.PrettyPrinter(indent=4)

Get data

# this chunk is only evaluated when data is updated

# this chunk code copied from
# https://github.com/blha303/srcomapi

# Call the speedrun.com API through srcomapi, with debug output switched on.
src_api = srcomapi.SpeedrunCom()
src_api.debug = 1

# It's recommended to cache the game ID and use it for future requests.
# Data is cached for the current session by classname/id so future
# requests for the same game are instantaneous.

# Keep the search result in a named variable. The upstream example reads it
# back from `_`, which is only set by the interactive interpreter's display
# hook, so relying on it breaks as soon as the code is run as a script.
games = src_api.search(srcomapi.datatypes.Game, {"name": "super metroid"})

# can we add a historical == true to this?
# The first search hit is taken to be the game.
game = games[0]

# Build a dictionary of leaderboards keyed by category name. Per-level
# categories hold a nested dictionary keyed by level name; every other
# category maps straight to its leaderboard.
srcomapi_runs = {}
for category in game.categories:
  if category.name not in srcomapi_runs:
    srcomapi_runs[category.name] = {}
  if category.type == 'per-level':
    for level in game.levels:
      srcomapi_runs[category.name][level.name] = dt.Leaderboard(
        src_api,
        data=src_api.get(
          "leaderboards/{}/level/{}/{}?embed=variables".format(
            game.id, level.id, category.id)))
  else:
    srcomapi_runs[category.name] = dt.Leaderboard(
      src_api,
      data=src_api.get(
        "leaderboards/{}/category/{}?embed=variables".format(
          game.id, category.id)))


### 100% Super Metroid leaderboard
# Pick the leaderboard stored under the '100%' category name.
src_leaderboard = srcomapi_runs['100%']



# Inspect the object type.
# NOTE(review): the original note here said "list of dictionaries", but the
# non-per-level entries of srcomapi_runs are built with dt.Leaderboard --
# confirm what type is actually reported.
type(src_leaderboard)

# Only nested object seems to be runs
src_leaderboard.keys()

# Don't know what this is, is it useful?
# generator object DataType
src_leaderboard.variables 

# Look at the container of runs, one entry from it, and the raw data held
# by that entry's 'run' object.
type(src_leaderboard.runs)
type(src_leaderboard.runs[0])
src_leaderboard.runs[0]['run'].data

# Convert the run objects to dictionaries: take the `.data` of each entry's
# 'run' object, one item per leaderboard entry, in leaderboard order.
src_runs = [src_leaderboard.runs[x]['run'].data for x in range(len(src_leaderboard.runs))]

Get runs



# Exploratory inspection of src_runs; nothing here is assigned.
# NOTE: each `#> Error` line is captured output from the statement before
# it (the document was knitted without the API results in scope), not a
# comment on the line that follows it.

# src_runs is a list ...
type(src_runs)
# ... whose elements should be dictionaries
#> Error: NameError: name 'src_runs' is not defined
type(src_runs[1])

# keys available on a single run
#> Error: NameError: name 'src_runs' is not defined
src_runs[0].keys()

# rank: 'place' is read from the leaderboard entry, not from the run data
#> Error: NameError: name 'src_runs' is not defined
src_leaderboard.runs[0]['place']

# run as a nested dictionary
#> Error: NameError: name 'src_leaderboard' is not defined
src_runs[0].keys()

# Inspect bits we want to extract (id, times, date) on one example run
#> Error: NameError: name 'src_runs' is not defined
src_runs[5]['id']
#> Error: NameError: name 'src_runs' is not defined
src_runs[5]['times']
#> Error: NameError: name 'src_runs' is not defined
src_runs[5]['date']
#> Error: NameError: name 'src_runs' is not defined
# print the first player record of every run
for x in range(len(src_runs)):
  print(src_runs[x]['players'][0])
#> Error: NameError: name 'src_runs' is not defined

Run dataframe

src_run_df	description	from
date	timestamp of run upload	`runs[index]['run']['date']`
run_id	unique identifier of run	`runs[index]['run']['id']`
run_time	total time of run in s or ms	`runs[index]['run']['times']['realtime_t']`
player_id	unique identifier of player	player id == run id, debug extraction
rank	rank, empty if historical, currently don’t have historical obs	`src_runs[x]['place']`


# Number of runs; reused below when building the run dataframe.
n_obs = len(src_runs)

# how to get splits? [src_runs][x]['run']['splits']

# Get a list of players: one small dataframe per run, built from that run's
# 'players' entry.
#> Error: NameError: name 'src_runs' is not defined
src_players = [pd.DataFrame(
  src_runs[x]['players']) for x in range(len(src_runs))]

# check length of run-player is same
#> Error: NameError: name 'src_runs' is not defined
len(src_players)

# i think these are all single-depth dicts, and can be flattened
#> Error: NameError: name 'src_players' is not defined
pp.pprint(src_players[0:3])

# Concatenate the per-run player frames into one dataframe and give the
# columns project-specific names.
#> Error: NameError: name 'src_players' is not defined
src_player_df = pd.concat(src_players).rename(columns={
  'uri':'uri_player',
  'id':'src_id',
  'rel':'src_user_guest'
})

# Create the value used for the later API lookup: the player's src_id,
# falling back to 'name' where src_id is missing.
# NOTE(review): presumably the rows without an id are guest players --
# confirm against the API response.
#> Error: NameError: name 'src_players' is not defined
src_player_df["api_call"] = src_player_df[
  'src_id'].fillna(src_player_df['name'])

# Both source columns are now folded into api_call, so drop them.
# ugh couldn't figure out how to lambda that ifelse
#> Error: NameError: name 'src_player_df' is not defined
src_player_df = src_player_df.drop(columns = ['src_id', 'name'])

# take a look
#> Error: NameError: name 'src_player_df' is not defined
src_player_df.head()

# check number of records should match runs
#> Error: NameError: name 'src_player_df' is not defined
src_player_df.shape[0]


# Build the run dataframe, one row per run, with list comprehensions
# (new tool for me). 'rank' comes from the leaderboard entry's 'place';
# every other column comes from the run's own data.
#> Error: NameError: name 'src_player_df' is not defined
src_run_df = pd.DataFrame({
  # need to inspect what happens to historical ranking
  'rank' : [src_leaderboard.runs[x]['place'] for x in range(n_obs)], 
  't_human' : [src_runs[x]['times']['primary'] 
    for x in range(n_obs)],
  'date' : [src_runs[x]['date'] for x in range(n_obs)],
  't_s' : [src_runs[x]['times']['realtime_t'] 
    for x in range(n_obs)],
  'run_id' : [src_runs[x]['id'] for x in range(n_obs)]
})

# check that run-players and runs have same length
#> Error: NameError: name 'n_obs' is not defined
src_player_df.shape[0] == src_run_df.shape[0]
#> Error: NameError: name 'src_player_df' is not defined
# Bind runs and players side by side. Both indexes are reset first so the
# rows are matched by position; this relies on the equal-length check above.
src_run_runners_df = pd.concat([src_run_df.reset_index(drop = True), src_player_df.reset_index(drop = True)], axis=1)

# check we still have the same number of runs
# should be one row per run
#> Error: NameError: name 'src_run_df' is not defined
len(src_leaderboard.runs) == src_run_runners_df.shape[0]

# inspect runs dataframe
#> Error: NameError: name 'src_leaderboard' is not defined
src_run_runners_df.head()
#> Error: NameError: name 'src_run_runners_df' is not defined
src_run_runners_df.shape
#> Error: NameError: name 'src_run_runners_df' is not defined
src_run_runners_df.columns
#> Error: NameError: name 'src_run_runners_df' is not defined

Player data

player_df	description
player_id	unique identifier of player
player_handle	human-readable unique tag
location	geographic location of player

# this loop fails because not all ids are valid


# Get a name and a location for every runner. Exactly one value is appended
# to each list per row of src_run_runners_df, so both lists stay aligned
# with the runs.
src_location = []
src_name = []

# Single player kept around for trying the loop body by hand.
player = src_run_runners_df.api_call[7]

# well this turned into something horrible, despite my best efforts
# but I think it works
#> Error: NameError: name 'src_run_runners_df' is not defined
for player in list(src_run_runners_df.api_call):
  # progress output: list lengths so far and the player being looked up
  print("loop index")
  print(len(src_location))
  print(len(src_name))
  print(player)
  try:
    # First attempt: treat api_call as a user id.
    user = src_api.get_user(player)
    src_name.append(user.name)
    try:
      src_location.append(user.location['country']['names']['international'])
    except Exception:
      # user exists but has no usable country recorded
      src_location.append("no record")
  except Exception:
    # `except Exception` (not a bare `except:`) so that Ctrl-C and
    # interpreter exit still stop this long, sleeping loop.
    try:
      # Second attempt: search by name. The result is bound to a variable
      # rather than read back from `_`, which only the interactive
      # interpreter sets.
      # NOTE(review): confirm srcomapi.datatypes.Player exists and that
      # `.location` has the same shape as the country string used above.
      matches = src_api.search(srcomapi.datatypes.Player, {'name': player})
      api_player = matches[0]
      src_location.append(api_player.location)
      src_name.append(player)
    except Exception:
      src_location.append("no record")
      src_name.append(player)
  # pause between players to go easy on the API
  time.sleep(0.5)
#> Error: NameError: name 'src_run_runners_df' is not defined
# The three counts below should agree: rows in the run dataframe, and the
# number of locations and names collected by the lookup loop.
src_run_runners_df.shape[0]  
#> Error: NameError: name 'src_run_runners_df' is not defined
len(src_location)
#> 0
len(src_name)
#> 0
# peek at the first few collected values
print(src_location[0:9])
#> []
print(src_name[0:9])

# do we have the correct number of players? (repeats expected, 1/run)

#> []

# Attach the looked-up name and location to the run dataframe as the
# `player_name` and `country` columns.
src_raw_df = src_run_runners_df.assign(
  player_name = src_name,
  country = src_location
)
#> Error: NameError: name 'src_run_runners_df' is not defined
# inspect size, last rows and first rows of the result
src_raw_df.shape
#> Error: NameError: name 'src_raw_df' is not defined
src_raw_df.tail()
#> Error: NameError: name 'src_raw_df' is not defined
src_raw_df.head()
#> Error: NameError: name 'src_raw_df' is not defined

# bring it over into sweet, sweet easier R
# The Python dataframe is read through reticulate's `py` object. It is
# stored as `src_df_all_obs` because that is the name every later chunk
# (plots, outlier checks, final clean-up) reads from.
src_df_all_obs <- py$src_raw_df
#> Error in eval(expr, envir, enclos): AttributeError: module '__main__' has no attribute 'src_raw_df'

# Reorder columns: human-readable fields first, then id columns, then
# whatever is left.
src_df <-
  src_df_all_obs %>%
  select(
    rank,
    player_name,
    date,
    t_human,
    country,
    contains("id"),
    everything()
  )
#> Error in eval(expr, envir, enclos): object 'src_df_all_obs' not found

src_df %>% head()
#>   rank  player_name       date        t_human       country   run_id      t_s
#> 1    1    ShinyZeni 2023-06-20     PT1H12M47S United States ywrjqd9m 4367.000
#> 2    2        zoast 2021-02-24     PT1H12M55S         Palau yo75d4dm 4375.000
#> 3    3   Behemoth87 2021-12-08 PT1H12M55.930S       England m36d0q6m 4375.930
#> 4    4        Gebbu 2023-05-08     PT1H13M12S        Norway m3qo724y 4392.000
#> 5    5 Static_Shock 2023-06-10 PT1H13M58.367S        Brazil m3qrx76y 4438.367
#> 6    6     CScottyW 2023-06-27     PT1H14M18S United States zpw9px8y 4458.000
#>   src_user_guest                                     uri_player api_call
#> 1           user https://www.speedrun.com/api/v1/users/zxzno3ex zxzno3ex
#> 2           user https://www.speedrun.com/api/v1/users/18v6k4nx 18v6k4nx
#> 3           user https://www.speedrun.com/api/v1/users/zxz2wy4x zxz2wy4x
#> 4           user https://www.speedrun.com/api/v1/users/xk49m26j xk49m26j
#> 5           user https://www.speedrun.com/api/v1/users/x35ve3kj x35ve3kj
#> 6           user https://www.speedrun.com/api/v1/users/18q3n0dj 18q3n0dj

# Store src_df as package data, replacing any previously saved copy.
usethis::use_data(src_df, overwrite=TRUE)

Visualise runs

Now we have the data from speedrun.com leaderboard, we can plot the distribution of runs.

# Plot all observations, before any filtering of implausible run times.

all_run_raincloud(src_df_all_obs)
#> Error in eval(expr, envir, enclos): object 'src_df_all_obs' not found

We have a handful of 0 entries and some > 3 hours.

# Which runs sit at the extremes? Keep those under 4000 s (faster than the
# top of the leaderboard shown earlier, so this catches the 0 entries) or
# over 3 hours.
src_df_all_obs %>% 
  filter(t_s < 4000 | t_s > 3 * 60 * 60)
#> Error in eval(expr, envir, enclos): object 'src_df_all_obs' not found

The 0 entries are runs where the tool the player used to record the run captured gametime but not realtime. Our analyses are on realtime, so we will exclude these observations.

We are also interested in comparing speed runners, as opposed to those logging a casual play-through of the game, which takes some hours.

# Share of all observations whose time exceeds three hours. The mean of a
# logical vector is the proportion of TRUE values; the outer parentheses
# print the result as well as storing it.
(prop_greater_3hrs <- mean(src_df_all_obs$t_s > 3 * 60 * 60))
#> Error in eval(expr, envir, enclos): object 'src_df_all_obs' not found

Since only `r round(prop_greater_3hrs, 2) * 100`% of runs are greater than 3 hours, these are negligible, and arguably not speed runs. We will define a Super Metroid speed run, for this analysis, to be a Super Metroid 100% run that takes under 3 hours.

Raincloud

# Plot only the runs we count as speed runs: drop the 0 length records and
# anything that lasts 3 hours or longer.
src_df_all_obs %>%
  filter(t_s > 0 & t_s < 3 * 60 * 60) %>%
  all_run_raincloud()
#> Error in eval(expr, envir, enclos): object 'src_df_all_obs' not found

Write run data from speedrun.com to supermetroid

# this chunk is evaluated when data is updated
# Final clean-up of the speedrun.com observations: keep plausible speed
# runs and give the key columns explicit types.
src_df <-
  src_df_all_obs %>%
  # filter runs to less than 3 hours, and weird 0 length records
  # we won't consider these speed runs
  filter(t_s > 0, t_s < 3 * 60 * 60) %>%
  mutate(
    rank = as.integer(rank),
    player_name = as.character(player_name),
    # Parse the "YYYY-MM-DD" strings into dates. Namespace-qualified so it
    # resolves even when library(tidyverse) does not attach lubridate.
    date = lubridate::ymd(date),
    # The location column is named `country` (set when the Python
    # dataframe was assembled); there is no `location` column to read.
    country = as.character(country)
  )

head(src_df)