Data Cleaning

Set Up

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
library(dplyr)

Import the data

df_general <- read_csv("data/generalesport.csv")
Rows: 669 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Game, Genre
dbl (6): ReleaseDate, TotalEarnings, OfflineEarnings, PercentOffline, TotalP...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_historical <- read_csv("data/historicalesport.csv")
Rows: 10239 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (1): Game
dbl  (3): Earnings, Players, Tournaments
date (1): Date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Inspect data

glimpse(df_general)
Rows: 669
Columns: 8
$ Game             <chr> "Age of Empires", "Age of Empires II", "Age of Empire…
$ ReleaseDate      <dbl> 1997, 1999, 2005, 2021, 2011, 2002, 2018, 2019, 2018,…
$ Genre            <chr> "Strategy", "Strategy", "Strategy", "Strategy", "Stra…
$ TotalEarnings    <dbl> 736284.75, 3898508.73, 122256.72, 1190813.44, 11462.9…
$ OfflineEarnings  <dbl> 522378.17, 1361409.22, 44472.60, 439117.93, 775.00, 8…
$ PercentOffline   <dbl> 0.70947846, 0.34921282, 0.36376405, 0.36875460, 0.067…
$ TotalPlayers     <dbl> 624, 2256, 172, 643, 52, 236, 14, 133, 698, 1158, 90,…
$ TotalTournaments <dbl> 341, 1939, 179, 423, 68, 298, 8, 52, 207, 779, 29, 22…
glimpse(df_historical)
Rows: 10,239
Columns: 5
$ Date        <date> 1998-01-01, 1998-01-01, 1998-05-01, 1998-05-01, 1998-05-0…
$ Game        <chr> "Command & Conquer: Red Alert", "QuakeWorld", "Quake II", …
$ Earnings    <dbl> 15000.00, 15000.00, 15000.00, 15000.00, 300.00, 6500.00, 3…
$ Players     <dbl> 8, 8, 4, 2, 3, 4, 13, 17, 8, 1, 5, 2, 8, 8, 1, 9, 15, 6, 1…
$ Tournaments <dbl> 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 4…

Data Cleaning

Rename Data

df_general1 <- df_general |>
  rename("game1"="Game",
         "year"="ReleaseDate",
         "genre"="Genre",
         "totalearn"="TotalEarnings",
         "offlineearn"="OfflineEarnings",
         "percent"="PercentOffline",
         "totallayer"="TotalPlayers",
         "tournament"="TotalTournaments")
df_historical_cleaned <- df_historical |>
  rename("date" = "Date",
         "game2"="Game",
         "earn"="Earnings",
         "player" = "Players",
         "tournament" = "Tournaments")|>
  filter(date < "2024-01-01") 
# Remove data after 2024 as it only goes through October

Check Data

glimpse(df_general1)
Rows: 669
Columns: 8
$ game1       <chr> "Age of Empires", "Age of Empires II", "Age of Empires III…
$ year        <dbl> 1997, 1999, 2005, 2021, 2011, 2002, 2018, 2019, 2018, 2007…
$ genre       <chr> "Strategy", "Strategy", "Strategy", "Strategy", "Strategy"…
$ totalearn   <dbl> 736284.75, 3898508.73, 122256.72, 1190813.44, 11462.98, 18…
$ offlineearn <dbl> 522378.17, 1361409.22, 44472.60, 439117.93, 775.00, 86723.…
$ percent     <dbl> 0.70947846, 0.34921282, 0.36376405, 0.36875460, 0.06760895…
$ totallayer  <dbl> 624, 2256, 172, 643, 52, 236, 14, 133, 698, 1158, 90, 134,…
$ tournament  <dbl> 341, 1939, 179, 423, 68, 298, 8, 52, 207, 779, 29, 22, 81,…
skim(df_general1)
Data summary
Name df_general1
Number of rows 669
Number of columns 8
_______________________
Column type frequency:
character 2
numeric 6
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
game1 0 1 3 53 0 669 0
genre 0 1 6 31 0 12 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1.00 2012.79 7.55 1981 2007.00 2015.00 2019.0 2025 ▁▁▅▇▇
totalearn 0 1.00 2828052.65 18809733.23 0 1174.50 39279.73 308794.7 360461163 ▇▁▁▁▁
offlineearn 0 1.00 1993359.64 14549047.05 0 0.00 18553.65 212540.3 313554838 ▇▁▁▁▁
percent 61 0.91 0.66 0.41 0 0.24 0.91 1.0 1 ▃▁▁▁▇
totallayer 0 1.00 247.95 1034.40 0 5.00 25.00 106.0 16518 ▇▁▁▁▁
tournament 0 1.00 100.24 502.82 0 1.00 5.00 29.0 7510 ▇▁▁▁▁

Remove the missing data and 2024 data for general dataset

df_general_cleaned <- df_general1 |>
  filter(!is.na(percent)) |>
  filter(year != 2024)
glimpse(df_general_cleaned)
Rows: 601
Columns: 8
$ game1       <chr> "Age of Empires", "Age of Empires II", "Age of Empires III…
$ year        <dbl> 1997, 1999, 2005, 2021, 2011, 2002, 2018, 2019, 2018, 2007…
$ genre       <chr> "Strategy", "Strategy", "Strategy", "Strategy", "Strategy"…
$ totalearn   <dbl> 736284.75, 3898508.73, 122256.72, 1190813.44, 11462.98, 18…
$ offlineearn <dbl> 522378.17, 1361409.22, 44472.60, 439117.93, 775.00, 86723.…
$ percent     <dbl> 0.70947846, 0.34921282, 0.36376405, 0.36875460, 0.06760895…
$ totallayer  <dbl> 624, 2256, 172, 643, 52, 236, 14, 133, 698, 1158, 90, 134,…
$ tournament  <dbl> 341, 1939, 179, 423, 68, 298, 8, 52, 207, 779, 29, 22, 81,…

Check Data

skim(df_general_cleaned)
Data summary
Name df_general_cleaned
Number of rows 601
Number of columns 8
_______________________
Column type frequency:
character 2
numeric 6
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
game1 0 1 3 53 0 601 0
genre 0 1 6 31 0 12 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1 2012.59 7.41 1985 2007.00 2015.00 2019.0 2025 ▁▂▅▇▇
totalearn 0 1 3145074.85 19821955.68 10 5000.00 57728.01 492000.0 360461163 ▇▁▁▁▁
offlineearn 0 1 2216125.77 15335317.45 0 220.00 30400.00 321008.3 313554838 ▇▁▁▁▁
percent 0 1 0.66 0.41 0 0.25 0.91 1.0 1 ▃▁▁▁▇
totallayer 0 1 275.11 1088.07 0 6.00 34.00 126.0 16518 ▇▁▁▁▁
tournament 0 1 111.36 529.39 1 2.00 7.00 40.0 7510 ▇▁▁▁▁
skim(df_historical_cleaned)
Data summary
Name df_historical_cleaned
Number of rows 9731
Number of columns 5
_______________________
Column type frequency:
character 1
Date 1
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
game2 0 1 3 53 0 609 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
date 0 1 1998-01-01 2023-12-01 2018-06-01 299

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
earn 0 1 180971.83 1000303.37 0 1153.88 11000 74676.89 40140651 ▇▁▁▁▁
player 0 1 60.58 132.29 0 6.00 16 51.00 1719 ▇▁▁▁▁
tournament 0 1 6.64 12.25 1 1.00 2 6.00 172 ▇▁▁▁▁

Save Data

write_csv(df_general_cleaned,"data/general_cleaned.csv")
write_csv(df_historical_cleaned,"data/historical_cleaned.csv")