Data Cleaning

Set Up

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(skimr)
library(dplyr)

Import the data

df_general <- read_csv("data/generalesport.csv")

Rows: 669 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Game, Genre
dbl (6): ReleaseDate, TotalEarnings, OfflineEarnings, PercentOffline, TotalP...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

df_historical <- read_csv("data/historicalesport.csv")

Rows: 10239 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (1): Game
dbl  (3): Earnings, Players, Tournaments
date (1): Date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Inspect data

glimpse(df_general)

Rows: 669
Columns: 8
$ Game             <chr> "Age of Empires", "Age of Empires II", "Age of Empire…
$ ReleaseDate      <dbl> 1997, 1999, 2005, 2021, 2011, 2002, 2018, 2019, 2018,…
$ Genre            <chr> "Strategy", "Strategy", "Strategy", "Strategy", "Stra…
$ TotalEarnings    <dbl> 736284.75, 3898508.73, 122256.72, 1190813.44, 11462.9…
$ OfflineEarnings  <dbl> 522378.17, 1361409.22, 44472.60, 439117.93, 775.00, 8…
$ PercentOffline   <dbl> 0.70947846, 0.34921282, 0.36376405, 0.36875460, 0.067…
$ TotalPlayers     <dbl> 624, 2256, 172, 643, 52, 236, 14, 133, 698, 1158, 90,…
$ TotalTournaments <dbl> 341, 1939, 179, 423, 68, 298, 8, 52, 207, 779, 29, 22…

glimpse(df_historical)

Rows: 10,239
Columns: 5
$ Date        <date> 1998-01-01, 1998-01-01, 1998-05-01, 1998-05-01, 1998-05-0…
$ Game        <chr> "Command & Conquer: Red Alert", "QuakeWorld", "Quake II", …
$ Earnings    <dbl> 15000.00, 15000.00, 15000.00, 15000.00, 300.00, 6500.00, 3…
$ Players     <dbl> 8, 8, 4, 2, 3, 4, 13, 17, 8, 1, 5, 2, 8, 8, 1, 9, 15, 6, 1…
$ Tournaments <dbl> 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 4…

Data Cleaning

Rename Data

df_general1 <- df_general |>
  rename("game1"="Game",
         "year"="ReleaseDate",
         "genre"="Genre",
         "totalearn"="TotalEarnings",
         "offlineearn"="OfflineEarnings",
         "percent"="PercentOffline",
         "totallayer"="TotalPlayers",
         "tournament"="TotalTournaments")
df_historical_cleaned <- df_historical |>
  rename("date" = "Date",
         "game2"="Game",
         "earn"="Earnings",
         "player" = "Players",
         "tournament" = "Tournaments")|>
  filter(date < "2024-01-01") 
# Remove data after 2024 as it only goes through October

Check Data

glimpse(df_general1)

Rows: 669
Columns: 8
$ game1       <chr> "Age of Empires", "Age of Empires II", "Age of Empires III…
$ year        <dbl> 1997, 1999, 2005, 2021, 2011, 2002, 2018, 2019, 2018, 2007…
$ genre       <chr> "Strategy", "Strategy", "Strategy", "Strategy", "Strategy"…
$ totalearn   <dbl> 736284.75, 3898508.73, 122256.72, 1190813.44, 11462.98, 18…
$ offlineearn <dbl> 522378.17, 1361409.22, 44472.60, 439117.93, 775.00, 86723.…
$ percent     <dbl> 0.70947846, 0.34921282, 0.36376405, 0.36875460, 0.06760895…
$ totallayer  <dbl> 624, 2256, 172, 643, 52, 236, 14, 133, 698, 1158, 90, 134,…
$ tournament  <dbl> 341, 1939, 179, 423, 68, 298, 8, 52, 207, 779, 29, 22, 81,…

skim(df_general1)

Data summary
Name	df_general1
Number of rows	669
Number of columns	8
_______________________
Column type frequency:
character	2
numeric	6
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
game1	0	1	3	53	0	669	0
genre	0	1	6	31	0	12	0

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2012.79	7.55	1981	2007.00	2015.00	2019.0	2025	▁▁▅▇▇
totalearn	0	1.00	2828052.65	18809733.23	0	1174.50	39279.73	308794.7	360461163	▇▁▁▁▁
offlineearn	0	1.00	1993359.64	14549047.05	0	0.00	18553.65	212540.3	313554838	▇▁▁▁▁
percent	61	0.91	0.66	0.41	0	0.24	0.91	1.0	1	▃▁▁▁▇
totallayer	0	1.00	247.95	1034.40	0	5.00	25.00	106.0	16518	▇▁▁▁▁
tournament	0	1.00	100.24	502.82	0	1.00	5.00	29.0	7510	▇▁▁▁▁

Remove the missing data and 2024 data for general dataset

df_general_cleaned <- df_general1 |>
  filter(!is.na(percent)) |>
  filter(year != 2024)
glimpse(df_general_cleaned)

Rows: 601
Columns: 8
$ game1       <chr> "Age of Empires", "Age of Empires II", "Age of Empires III…
$ year        <dbl> 1997, 1999, 2005, 2021, 2011, 2002, 2018, 2019, 2018, 2007…
$ genre       <chr> "Strategy", "Strategy", "Strategy", "Strategy", "Strategy"…
$ totalearn   <dbl> 736284.75, 3898508.73, 122256.72, 1190813.44, 11462.98, 18…
$ offlineearn <dbl> 522378.17, 1361409.22, 44472.60, 439117.93, 775.00, 86723.…
$ percent     <dbl> 0.70947846, 0.34921282, 0.36376405, 0.36875460, 0.06760895…
$ totallayer  <dbl> 624, 2256, 172, 643, 52, 236, 14, 133, 698, 1158, 90, 134,…
$ tournament  <dbl> 341, 1939, 179, 423, 68, 298, 8, 52, 207, 779, 29, 22, 81,…

Check Data

skim(df_general_cleaned)

Data summary
Name	df_general_cleaned
Number of rows	601
Number of columns	8
_______________________
Column type frequency:
character	2
numeric	6
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
game1	0	1	3	53	0	601	0
genre	0	1	6	31	0	12	0

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	1	2012.59	7.41	1985	2007.00	2015.00	2019.0	2025	▁▂▅▇▇
totalearn	1	3145074.85	19821955.68	10	5000.00	57728.01	492000.0	360461163	▇▁▁▁▁
offlineearn	1	2216125.77	15335317.45	0	220.00	30400.00	321008.3	313554838	▇▁▁▁▁
percent	1	0.66	0.41	0	0.25	0.91	1.0	1	▃▁▁▁▇
totallayer	1	275.11	1088.07	0	6.00	34.00	126.0	16518	▇▁▁▁▁
tournament	1	111.36	529.39	1	2.00	7.00	40.0	7510	▇▁▁▁▁

skim(df_historical_cleaned)

Data summary
Name	df_historical_cleaned
Number of rows	9731
Number of columns	5
_______________________
Column type frequency:
character	1
Date	1
numeric	3
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
game2	0	1	3	53	0	609	0

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
date	0	1	1998-01-01	2023-12-01	2018-06-01	299

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
earn	1	180971.83	1000303.37	0	1153.88	11000	74676.89	40140651	▇▁▁▁▁
player	1	60.58	132.29	0	6.00	16	51.00	1719	▇▁▁▁▁
tournament	1	6.64	12.25	1	1.00	2	6.00	172	▇▁▁▁▁

Save Data

write_csv(df_general_cleaned,"data/general_cleaned.csv")
write_csv(df_historical_cleaned,"data/historical_cleaned.csv")