Skip to main content

Parsing SC2 replays for later analysis

I've realized I owe you an explanation on how to parse your own SC2 replays for the series of posts on Bayesian SC2 replay data analysis. Let's go through it here!

We'll use ZephyrBlu's zephyrus-sc2-parser library, which you can download via pip install zephyrus-sc2-parser.

Parsing the replays

This process currently dumps a boatload of warnings and exceptions, so I'm choosing to wrap the parsing call in a try-except that prints each exception and skips that replay, and to silence the thrown warnings with warnings.simplefilter("ignore"). Feel free to remove the warning filter on your end; but don't say I didn't warn you!

Note that this process takes a while, and we'll have to do some wrangling later on, so it makes more sense to parse all the replays first and have them all in memory for later. Keeping everything in memory might fail for larger datasets.

In [1]:
import pathlib
import warnings

import tqdm.auto as tqdm
import zephyrus_sc2_parser

REPLAY_DIRECTORY = "/home/dominik/Links/SC2Reps"
PLAYER_NAME = "Perfi"

with warnings.catch_warnings():
    warnings.simplefilter("ignore")  # silence warnings raised while parsing
    replays = [*pathlib.Path(REPLAY_DIRECTORY).glob("*.SC2Replay")]
    parsed_replays = dict()  # replay path -> whatever parse_replay returned
    for replay_file in tqdm.tqdm(replays):
        try:
            replay = zephyrus_sc2_parser.parse_replay(replay_file, local=True)
        except Exception as e:
            # report the failure and move on to the next replay
            print(f"Failed for {replay_file}: {e}")
        else:
            parsed_replays[replay_file] = replay
HBox(children=(FloatProgress(value=0.0, max=506.0), HTML(value='')))
Failed for /home/dominik/Links/SC2Reps/Ephemeron LE (27).SC2Replay: 'NoneType' object has no attribute 'race'
Failed for /home/dominik/Links/SC2Reps/Triton LE (4).SC2Replay: local variable 'game_length' referenced before assignment
Failed for /home/dominik/Links/SC2Reps/Winter's Gate LE (4).SC2Replay: local variable 'game_length' referenced before assignment
Failed for /home/dominik/Links/SC2Reps/Zen LE (4).SC2Replay: 'NoneType' object has no attribute 'race'
Failed for /home/dominik/Links/SC2Reps/World of Sleepers LE (48).SC2Replay: local variable 'game_length' referenced before assignment
Failed for /home/dominik/Links/SC2Reps/Whitewater Line LE.SC2Replay: 'NoneType' object has no attribute 'race'
Failed for /home/dominik/Links/SC2Reps/Bone Temple LE.SC2Replay: 'NoneType' object has no attribute 'race'
Failed for /home/dominik/Links/SC2Reps/Acropolis LE (19).SC2Replay: 'NoneType' object has no attribute 'race'
Failed for /home/dominik/Links/SC2Reps/World of Sleepers LE (15).SC2Replay: 'NoneType' object has no attribute 'race'
Failed for /home/dominik/Links/SC2Reps/Ephemeron LE (25).SC2Replay: 'NoneType' object has no attribute 'race'
--- Logging error ---
Traceback (most recent call last):
  File "/home/dominik/.local/lib/python3.8/site-packages/zephyrus_sc2_parser/parser.py", line 154, in parse_replay
    players = create_players(player_info, events)
  File "/home/dominik/.local/lib/python3.8/site-packages/zephyrus_sc2_parser/utils.py", line 68, in create_players
    new_player.race = non_english_races[new_player.race.encode('utf-8')]
KeyError: b''

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/progs/miniconda3/envs/py38/lib/python3.8/logging/__init__.py", line 1081, in emit
    msg = self.format(record)
  File "/progs/miniconda3/envs/py38/lib/python3.8/logging/__init__.py", line 925, in format
    return fmt.format(record)
  File "/progs/miniconda3/envs/py38/lib/python3.8/logging/__init__.py", line 664, in format
    record.message = record.getMessage()
  File "/progs/miniconda3/envs/py38/lib/python3.8/logging/__init__.py", line 369, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/progs/miniconda3/envs/py38/lib/python3.8/runpy.py", line 193, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/progs/miniconda3/envs/py38/lib/python3.8/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/progs/miniconda3/envs/py38/lib/python3.8/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/progs/miniconda3/envs/py38/lib/python3.8/site-packages/traitlets/config/application.py", line 664, in launch_instance
    app.start()
  File "/progs/miniconda3/envs/py38/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 583, in start
    self.io_loop.start()
  File "/progs/miniconda3/envs/py38/lib/python3.8/site-packages/tornado/platform/asyncio.py", line 149, in start
    self.asyncio_loop.run_forever()
  File "/progs/miniconda3/envs/py38/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
    self._run_once()
  File "/progs/miniconda3/envs/py38/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
    handle._run()
  File "/progs/miniconda3/envs/py38/lib/python3.8/asyncio/events.py", line 81, in _run
    self._context.run(self._callback, *self._args)
  File "/progs/miniconda3/envs/py38/lib/python3.8/site-packages/tornado/ioloop.py", line 690, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "/progs/miniconda3/envs/py38/lib/python3.8/site-packages/tornado/ioloop.py", line 743, in _run_callback
    ret = callback()
  File "/progs/miniconda3/envs/py38/lib/python3.8/site-packages/tornado/gen.py", line 787, in inner
    self.run()
  File "/progs/miniconda3/envs/py38/lib/python3.8/site-packages/tornado/gen.py", line 748, in run
    yielded = self.gen.send(value)
  File "/progs/miniconda3/envs/py38/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 361, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/progs/miniconda3/envs/py38/lib/python3.8/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/progs/miniconda3/envs/py38/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 268, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/progs/miniconda3/envs/py38/lib/python3.8/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/progs/miniconda3/envs/py38/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 539, in execute_request
    self.do_execute(
  File "/progs/miniconda3/envs/py38/lib/python3.8/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/progs/miniconda3/envs/py38/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 300, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/progs/miniconda3/envs/py38/lib/python3.8/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/progs/miniconda3/envs/py38/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2857, in run_cell
    result = self._run_cell(
  File "/progs/miniconda3/envs/py38/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2886, in _run_cell
    return runner(coro)
  File "/progs/miniconda3/envs/py38/lib/python3.8/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
    coro.send(None)
  File "/progs/miniconda3/envs/py38/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3062, in run_cell_async
    has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  File "/progs/miniconda3/envs/py38/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3254, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "/progs/miniconda3/envs/py38/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-1-d51f8f2ea671>", line 16, in <module>
    replay = zephyrus_sc2_parser.parse_replay(replay_file, local=True)
  File "/home/dominik/.local/lib/python3.8/site-packages/zephyrus_sc2_parser/parser.py", line 162, in parse_replay
    logging.critical('A KeyError error occured:', error, 'unreadable file info')
Message: 'A KeyError error occured:'
Arguments: (KeyError(b''), 'unreadable file info')
Failed for /home/dominik/Links/SC2Reps/Eternal Empire LE (20).SC2Replay: local variable 'game_length' referenced before assignment

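The Logging error looked baffling to me at first, but the traceback explains it: the parser calls logging.critical('A KeyError error occured:', error, 'unreadable file info') with two extra arguments and no % placeholders in the message, so the logging module itself fails while formatting the message ("TypeError: not all arguments converted during string formatting"). In other words, it's a bug in how the parser reports that KeyError, not something in our code. We aren't missing out on many games, though: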

In [2]:
# share of the globbed replay files for which parse_replay did not raise
print(f"We successfully parsed {len(parsed_replays)} replays, which is {len(parsed_replays)/len(replays):.2%} of the total!")
We successfully parsed 464 replays, which is 97.68% of the total!

That was the first step; now, we continue to...

Pull the interesting data

Note that this mostly handles 1v1 data; it might be a bit more difficult to filter out stuff such as coop and team games. I would probably recommend filtering them out at an earlier stage, by filename.

In [8]:
# utility function to get our own player ID
def grab_player_id(players, name = PLAYER_NAME):
    """Return the key of the first player in `players` whose `.name` equals `name`.

    `players` is a mapping of player ID -> player object. Returns None when
    no player carries that name (including when `players` is empty).
    """
    for player_id, candidate in players.items():
        if candidate.name == name:
            return player_id
    return None


# Build one flat dict per replay; `results` later becomes the DataFrame rows.
results = []
for replay_file, replay in parsed_replays.items():
    players, timeline, engagements, summary, meta = replay
    if all(item is None for item in replay):
        print(f"Failed to parse for {replay_file}")
        continue
    my_id = grab_player_id(players, PLAYER_NAME)
    if my_id is None:
        # grab_player_id found no player called PLAYER_NAME; without this
        # guard the players[my_id] lookup below raises KeyError and aborts
        # the whole loop
        print(f"Could not find player {PLAYER_NAME} in {replay_file}")
        continue
    # 1v1 only: the two players are assumed to have the IDs 1 and 2
    enemy_id = 1 if (my_id == 2) else 2

    results.append(
        dict(
            replay_file=replay_file,
            time_played_at=meta['time_played_at'],
            win=meta["winner"] == my_id,

            race=players[my_id].race,
            enemy_race=players[enemy_id].race,

            mmr=summary['mmr'][my_id],

            enemy_mmr=summary['mmr'][enemy_id],
            # key spelling kept as-is: it is the column name in the output
            enemy_nickame=players[enemy_id].name,

            map_name=meta["map"],
            duration=meta['game_length'],
        )
    )

print(f"We successfully pulled data out of {len(results)} replays, which is {len(results)/len(replays):.2%} of the total!")
Failed to parse for /home/dominik/Links/SC2Reps/Ephemeron LE (24).SC2Replay
Failed to parse for /home/dominik/Links/SC2Reps/Malwarfare.SC2Replay
We successfully pulled data out of 462 replays, which is 97.26% of the total!

What I'm showing you here is the end result, but if you wanted to add some other metrics, you might be interested in the answer to:

How do I pick the interesting data?

We'll use the entries from the last replay. Most of them are dictionaries, so it's pretty easy to get access to their contents:

In [42]:
meta
Out[42]:
{'time_played_at': datetime.datetime(2020, 4, 30, 18, 34, 1, tzinfo=<UTC>),
 'map': 'Nightshade LE',
 'game_length': 1040,
 'winner': 2}

If you run this notebook locally, IPython has a nice widget to browse this data. If you're reading this on the website, you'll probably unfortunately see only <IPython.core.display.JSON object>:

In [13]:
from IPython.display import JSON
JSON(summary)
Out[13]:
<IPython.core.display.JSON object>
In [44]:
summary.keys()
Out[44]:
dict_keys(['mmr', 'avg_resource_collection_rate', 'avg_unspent_resources', 'apm', 'spm', 'resources_lost', 'resources_collected', 'workers_produced', 'workers_killed', 'workers_lost', 'supply_block', 'sq', 'avg_pac_per_min', 'avg_pac_action_latency', 'avg_pac_actions', 'avg_pac_gap', 'race'])

As you can (possibly) see, there's plenty of interesting data that I might use sometime. Beyond what we're already pulling out:

  • Average resource collection rate
  • the spending quotient, a (possibly flawed) measure of macro skill
  • time spent supply blocked
  • workers lost, killed and produced
  • per-race statistics:
    • Orbital Command energy efficiency and idle time
    • likewise for Nexii (Nexuses?)
    • Splash efficiency for Protoss

I probably wouldn't use Bayesian inference on all of them, though - it gets hard to come up with a model that involves all of them. Maybe a random forest model would be nice?

Either way, once we've found something interesting it's simple to access the fields:

In [45]:
summary['apm'][1]
Out[45]:
151.0

It's a bit more difficult to pull data out of players, as there are dedicated objects storing the data there; we can still make do:

In [46]:
# attributes that contain custom objects and therefore have to be dropped
custom_object_keys = [
    "current_selection",
    "objects",
    "control_groups",
    "pac_list",
    "current_pac",
    "active_ability",
]

clean_data = {}
for player_id, player in players.items():
    # work on a shallow copy so the player object itself stays untouched
    attributes = dict(player.__dict__)
    for key in custom_object_keys:
        del attributes[key]
    clean_data[player_id] = attributes
JSON(clean_data)
Out[46]:
<IPython.core.display.JSON object>

I'll showcase a few:

In [48]:
players[2].upgrades
Out[48]:
['WarpGateResearch',
 'ExtendedThermalLance',
 'Charge',
 'ProtossGroundWeaponsLevel1',
 'PsiStormTech',
 'GraviticDrive',
 'BlinkTech',
 'ProtossGroundWeaponsLevel2',
 'AdeptPiercingAttack',
 'ProtossGroundArmorsLevel1',
 'ProtossGroundWeaponsLevel3',
 'ProtossShieldsLevel1']
In [50]:
players[2].supply_block
Out[50]:
1568
In [52]:
players[2].resources_collected
Out[52]:
{'minerals': 31370, 'gas': 11109}

A bunch of these keys, such as unspent_resources, are time data, taken at discrete snapshots during the game. There's more time data, of course, in timeline:

In [40]:
JSON(timeline)
Out[40]:
<IPython.core.display.JSON object>

And I haven't yet been able to figure this one out:

In [54]:
engagements
Out[54]:
[]

Saving our results to DataFrame, then to CSV

We'll also calculate the MMR difference at this step.

In [4]:
import pandas as pd
# one row per replay, plus the MMR difference between us and the opponent
df = pd.DataFrame(results)
df["mmr_diff"] = df["mmr"] - df["enemy_mmr"]
df
Out[4]:
time_played_at win race enemy_race mmr enemy_mmr enemy_nickame map_name duration mmr_diff
0 2020-05-27 10:32:29+00:00 True Protoss Terran 4004 4173 giletjaune Nightshade LE 601 -169
1 2020-06-09 17:11:15+00:00 False Protoss Zerg 4186 4147 djakette Eternal Empire LE 420 39
2 2020-02-02 17:27:27+00:00 True Protoss Terran 3971 3913 Syocto Ephemeron LE 10 58
3 2019-12-20 18:53:00+00:00 True Zerg Terran 2984 3090 Jason Simulacrum LE 569 -106
4 2019-12-09 20:36:21+00:00 True Protoss Zerg 4015 4024 &lt;OGCOСK&gt;<sp/>ShushYo Nightshade LE 454 -9
... ... ... ... ... ... ... ... ... ... ...
457 2019-11-04 20:53:20+00:00 False Protoss Terran 3800 3883 &lt;MiClan&gt;<sp/>MiSHANYA Disco Bloodbath LE 396 -83
458 2020-05-04 12:43:06+00:00 True Protoss Terran 3926 3831 StaMinA Golden Wall LE 784 95
459 2020-02-02 17:15:06+00:00 False Protoss Zerg 4012 4092 &lt;0mg&gt;<sp/>Sroljo World of Sleepers LE 264 -80
460 2020-04-19 11:48:32+00:00 True Protoss Zerg 0 0 shadowofmich Simulacrum LE 297 0
461 2020-04-30 18:34:01+00:00 True Protoss Terran 3964 4055 &lt;BRs&gt;<sp/>GoodFellas Nightshade LE 1040 -91

462 rows × 10 columns

And we dump that to CSV, and we're done!

In [5]:
# write the table to a CSV file; the path is hard-coded, adjust it for your machine
df.to_csv("/home/dominik/Writing/blog/files/replays.csv")

TL;DR version

Feel free to take this script and modify as you see fit!

In [ ]:
import pathlib
import warnings

import tqdm.auto as tqdm
import zephyrus_sc2_parser

REPLAY_DIRECTORY = "/home/dominik/Links/SC2Reps"
PLAYER_NAME = "Perfi"
OUTPUT_CSV = "/home/dominik/Writing/blog/files/replays.csv"

# Parse every replay up front and keep the results in memory, keyed by path.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    replays = list(pathlib.Path(REPLAY_DIRECTORY).glob("*.SC2Replay"))
    parsed_replays = {}
    for replay_file in tqdm.tqdm(replays):
        try:
            replay = zephyrus_sc2_parser.parse_replay(replay_file, local=True)
        except Exception as e:
            # report the failure and move on to the next replay
            print(f"Failed for {replay_file}: {e}")
            continue
        parsed_replays[replay_file] = replay

# `results` does not exist yet at this point, so report the parse count here
print(f"We successfully parsed {len(parsed_replays)} replays, which is {len(parsed_replays)/len(replays):.2%} of the total!")

# utility function to get our own player ID
def grab_player_id(players, name = PLAYER_NAME):
    """Return the key of the first player in `players` whose `.name` equals `name`.

    `players` is a mapping of player ID -> player object. Returns None when
    no player carries that name (including when `players` is empty).
    """
    for player_id, candidate in players.items():
        if candidate.name == name:
            return player_id
    return None


# Build one flat dict per replay; `results` later becomes the DataFrame rows.
results = []
for replay_file, replay in parsed_replays.items():
    players, timeline, engagements, summary, meta = replay
    if all(item is None for item in replay):
        print(f"Failed to parse for {replay_file}")
        continue
    my_id = grab_player_id(players, PLAYER_NAME)
    if my_id is None:
        # grab_player_id found no player called PLAYER_NAME; without this
        # guard the summary['mmr'][my_id] lookup below raises KeyError and
        # aborts the whole loop
        print(f"Could not find player {PLAYER_NAME} in {replay_file}")
        continue
    # 1v1 only: the two players are assumed to have the IDs 1 and 2
    enemy_id = 1 if (my_id == 2) else 2

    mmr = summary['mmr'][my_id]
    enemy_mmr = summary['mmr'][enemy_id]

    results.append(
        dict(
            replay_file=replay_file,
            time_played_at=meta['time_played_at'],
            win=meta["winner"] == my_id,
            mmr=mmr,
            enemy_mmr=enemy_mmr,
            mmr_diff=mmr - enemy_mmr,
            race=players[my_id].race,
            enemy_race=players[enemy_id].race,
            # key spelling kept as-is: it is the column name in the output
            enemy_nickame=players[enemy_id].name,
            map_name=meta["map"],
            duration=meta['game_length'],
        )
    )

print(f"We successfully pulled data out of {len(results)} replays, which is {len(results)/len(replays):.2%} of the total!")

import pandas as pd
# one row per replay dict collected in `results`
df = pd.DataFrame(results)
# MMR difference between us and the opponent, computed from the two MMR columns
df['mmr_diff'] = df.mmr - df.enemy_mmr
# write the table to the path configured in OUTPUT_CSV
df.to_csv(OUTPUT_CSV)

If you have questions about this sort of thing, I'll be happy to help - ask away! :)

Comments