Grand Slam Point-by-Point Analysis

Tennis

Data Analysis

Serve patterns, rally length, break point pressure, distance run, and long-point fatigue across all four Grand Slams (2011–2020)

Published

March 12, 2026

Data

Point-by-point data from Jeff Sackmann’s Grand Slam repository. Covers matches on courts with the Hawkeye tracking system — typically from the second week onward plus select early-round showcourts. Australian Open runs 2011–2020; the other three slams run 2011–2019.

Code

# Local directory that holds the cached CSV downloads.
data_dir <- here::here("data-projects", "data")

# Slam identifiers as used in the repository's file names.
slams <- c("ausopen", "frenchopen", "usopen", "wimbledon")

# Year ranges requested per tournament.
ao_years <- 2011:2020
other_years <- 2011:2019

# One row per (slam, year) file to request: Australian Open first, then the
# other three slams, each over `other_years`.
slam_years <- tibble(
  slam = c(
    rep("ausopen", length(ao_years)),
    rep(c("frenchopen", "usopen", "wimbledon"), each = length(other_years))
  ),
  year = c(ao_years, rep(other_years, times = 3))
)

# Prefix shared by every raw file URL.
base_url <- "https://raw.githubusercontent.com/JeffSackmann/tennis_slam_pointbypoint/master/"

# ── Matches ──────────────────────────────────────────────────────────────────
# Build `matches_raw`: one matches CSV per (slam, year) row of `slam_years`,
# combined and cached locally so later runs skip the network.
matches_cache <- file.path(data_dir, "slam_matches.csv")

if (!file.exists(matches_cache)) {
  matches_raw <- pmap_dfr(slam_years, function(slam, year) {
    url <- paste0(base_url, year, "-", slam, "-matches.csv")
    tryCatch(
      read_csv(url, show_col_types = FALSE, col_types = cols(.default = "c")) |>
        # `.env$` pins the function arguments; a bare `slam` / `year` inside
        # mutate() resolves to a same-named CSV column first. Kept as
        # character so every column matches the all-character read above.
        mutate(slam = .env$slam, year = as.character(.env$year)),
      error = \(e) {
        # Report the failed file instead of dropping it silently.
        warning("Skipping ", url, ": ", conditionMessage(e), call. = FALSE)
        tibble()
      }
    )
  })
  # Only cache a non-empty download: an empty cache file would otherwise be
  # reused on every later run and never refreshed.
  if (nrow(matches_raw) > 0) {
    dir.create(data_dir, recursive = TRUE, showWarnings = FALSE)
    write_csv(matches_raw, matches_cache)
  }
} else {
  # Read the cache with the same all-character types as a fresh download so
  # both paths hand identical column types to the code below.
  matches_raw <- read_csv(
    matches_cache,
    show_col_types = FALSE,
    col_types = cols(.default = "c")
  )
}

# ── Points ────────────────────────────────────────────────────────────────────
# Build `points_raw`: one points CSV per (slam, year) row of `slam_years`,
# reduced to `keep_cols`, combined and cached locally.
points_cache <- file.path(data_dir, "slam_points.csv")

# Columns retained from each points file (files lacking some are tolerated).
keep_cols <- c(
  "match_id", "SetNo", "GameNo", "PointNumber", "PointServer",
  "P1GamesWon", "P2GamesWon", "P1PointsWon", "P2PointsWon",
  "SetWinner", "GameWinner", "PointWinner",
  "P1Score", "P2Score",
  "Speed_KMH", "ServeWidth", "ServeIndicator", "ServeDepth", "ReturnDepth",
  "RallyCount",
  "P1Ace", "P2Ace", "P1Winner", "P2Winner",
  "P1DoubleFault", "P2DoubleFault",
  "P1UnfErr", "P2UnfErr",
  "P1NetPoint", "P2NetPoint",
  "P1BreakPoint", "P2BreakPoint",
  "P1BreakPointWon", "P2BreakPointWon",
  "P1BreakPointMissed", "P2BreakPointMissed",
  "P1DistanceRun", "P2DistanceRun"
)

if (!file.exists(points_cache)) {
  points_raw <- pmap_dfr(slam_years, function(slam, year) {
    url <- paste0(base_url, year, "-", slam, "-points.csv")
    tryCatch({
      df <- read_csv(url, show_col_types = FALSE, col_types = cols(.default = "c"))
      # keep only columns that exist in this file
      df[intersect(keep_cols, names(df))] |>
        mutate(slam = .env$slam, year = as.integer(.env$year))
    }, error = \(e) {
      # Report the failed file instead of dropping it silently.
      warning("Skipping ", url, ": ", conditionMessage(e), call. = FALSE)
      tibble()
    })
  })
  # Only cache a non-empty download: an empty cache file would otherwise be
  # reused on every later run and never refreshed.
  if (nrow(points_raw) > 0) {
    dir.create(data_dir, recursive = TRUE, showWarnings = FALSE)
    write_csv(points_raw, points_cache)
  }
} else {
  # Read everything as character, as the fresh download does. Letting readr
  # guess types here could mis-type columns that are empty in the first rows;
  # the explicit casts in the cleaning step handle the conversion instead.
  points_raw <- read_csv(
    points_cache,
    show_col_types = FALSE,
    col_types = cols(.default = "c")
  )
}

# ── Clean & type-cast ─────────────────────────────────────────────────────────
# `pts`: point rows with counters/flags as integer, measurements as double, and
# a display label looked up from `slam_labels`. Values that cannot be parsed
# become NA; the coercion warnings are suppressed.
pts <- points_raw |>
  mutate(
    across(
      any_of(c(
        "year", "SetNo", "GameNo", "PointNumber", "PointServer",
        "PointWinner", "GameWinner", "RallyCount",
        "P1Ace", "P2Ace", "P1Winner", "P2Winner",
        "P1DoubleFault", "P2DoubleFault",
        "P1UnfErr", "P2UnfErr",
        "P1BreakPoint", "P2BreakPoint",
        "P1BreakPointWon", "P2BreakPointWon"
      )),
      function(col) suppressWarnings(as.integer(col))
    )
  ) |>
  mutate(
    across(
      any_of(c("Speed_KMH", "P1DistanceRun", "P2DistanceRun")),
      function(col) suppressWarnings(as.numeric(col))
    )
  ) |>
  mutate(slam_label = slam_labels[slam])

# `matches`: match metadata with an integer year.
matches <- mutate(matches_raw, year = as.integer(year))

Coverage overview

Not every match is tracked — Hawkeye data is only available for select courts. This heatmap shows how many matches have point data per slam per year.

Code

# Heatmap of coverage: number of distinct tracked matches per slam and year.
pts |>
  group_by(slam, year) |>
  summarise(n = n_distinct(match_id), .groups = "drop") |>
  mutate(slam_label = slam_labels[slam]) |>
  ggplot(aes(x = factor(year), y = slam_label, fill = n)) +
  geom_tile(colour = "white", linewidth = 0.5) +
  geom_text(
    aes(label = n),
    size = 3.5,
    colour = "white",
    fontface = "bold"
  ) +
  scale_fill_gradient(low = "#9ECAE1", high = "#08519C") +
  labs(
    title = "Matches with point-by-point data",
    fill = "Matches",
    x = NULL,
    y = NULL
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "right"
  )

Rally length by slam

Clay courts at Roland Garros produce the longest rallies; grass at Wimbledon the shortest. The distributions below pool all points by slam.

Code

# Histogram of rally length per slam, restricted to rallies of 1-40 shots.
# Facets are ordered by median rally length, longest first.
pts |>
  filter(!is.na(RallyCount), between(RallyCount, 1, 40)) |>
  mutate(
    slam_label = fct_reorder(slam_label, RallyCount, median, .desc = TRUE)
  ) |>
  ggplot(aes(x = RallyCount, fill = slam)) +
  geom_histogram(binwidth = 1, alpha = 0.85, colour = NA) +
  facet_wrap(~slam_label, scales = "free_y") +
  scale_fill_manual(values = slam_cols, labels = slam_labels) +
  labs(
    title = "Rally length distribution by slam",
    subtitle = "All tracked points, 2011–2020",
    x = "Rally length (shots)",
    y = "Points",
    fill = NULL
  ) +
  theme(legend.position = "none")

Code

# Yearly median rally length per slam (points with a recorded rally >= 1).
pts |>
  filter(!is.na(RallyCount) & RallyCount >= 1) |>
  group_by(slam, year) |>
  summarise(
    median_rally = median(RallyCount),
    mean_rally = mean(RallyCount),
    .groups = "drop"
  ) |>
  mutate(slam_label = slam_labels[slam]) |>
  ggplot(aes(x = year, y = median_rally, colour = slam)) +
  geom_line(linewidth = 0.9) +
  geom_point(size = 2) +
  scale_colour_manual(values = slam_cols, labels = slam_labels) +
  labs(
    title = "Median rally length over time by slam",
    x = NULL,
    y = "Median rally length (shots)",
    colour = NULL
  )

Serve speed

First serves are substantially faster than second serves at every slam. Hard-court slams (AO, USO) tend to produce slightly higher serve speeds than clay or grass.

Code

# `speed_df`: serves with a speed strictly between 100 and 260 km/h and a
# serve indicator of "1" or "2", labelled as 1st/2nd serve.
speed_df <- pts |>
  filter(!is.na(Speed_KMH)) |>
  filter(Speed_KMH > 100, Speed_KMH < 260) |>
  filter(ServeIndicator %in% c("1", "2")) |>
  mutate(
    serve = if_else(ServeIndicator == "1", "1st serve", "2nd serve"),
    slam_label = slam_labels[slam]
  )

# Density of serve speed, one panel per serve number x slam.
if (nrow(speed_df) > 0) {
  ggplot(speed_df, aes(x = Speed_KMH, fill = slam)) +
    geom_density(alpha = 0.65, colour = NA) +
    facet_grid(serve ~ slam_label) +
    scale_fill_manual(values = slam_cols, labels = slam_labels) +
    labs(
      title = "Serve speed distribution by slam and serve number",
      x = "Speed (km/h)",
      y = "Density",
      fill = NULL
    ) +
    theme(legend.position = "none")
}

Code

# Yearly median serve speed per slam and serve number.
# Starts from `speed_df` (built in the serve-speed distribution chunk), which
# already applies the speed-range / serve-indicator filter and the 1st/2nd
# serve labelling, so the two charts cannot drift apart.
speed_df |>
  group_by(slam, year, serve) |>
  summarise(median_speed = median(Speed_KMH), .groups = "drop") |>
  mutate(slam_label = slam_labels[slam]) |>
  ggplot(aes(year, median_speed, colour = slam, linetype = serve)) +
  geom_line(linewidth = 0.9) +
  geom_point(size = 2) +
  scale_colour_manual(values = slam_cols, labels = slam_labels) +
  labs(
    title = "Median serve speed over time",
    x = NULL, y = "Median speed (km/h)", colour = NULL, linetype = NULL
  )

Serve direction under pressure

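Does a server change their target when facing a break point? ServeWidth encodes five zones: W (wide), BW (body/wide), B (body), BC (body/centre) and C (centre/T). The chart below groups W and BW as "Wide", B and BC as "Body", and C as "Centre/T".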

Code

# `dir_df`: share of 1st serves aimed Wide / Body / Centre-T, per slam, split by
# whether the point is a break point.
dir_df <- pts |>
  filter(!is.na(ServeWidth), ServeWidth != "", ServeIndicator == "1") |>
  mutate(
    direction = case_when(
      ServeWidth %in% c("W", "BW") ~ "Wide",
      ServeWidth %in% c("B", "BC") ~ "Body",
      ServeWidth == "C"            ~ "Centre/T",
      TRUE                         ~ NA_character_
    ),
    # A break point is by definition held against the player serving, so a
    # flag on either side marks pressure on the server. This does not depend
    # on whether P{n}BreakPoint means "player n can break" or "player n is
    # facing a break point".
    # NOTE(review): the conversion chunk pairs P{n}BreakPoint with
    # P{n}BreakPointWon, i.e. reads it as player n's chance to break — under
    # that reading, matching PointServer == n with P{n}BreakPoint would never
    # be true. Confirm against the data dictionary.
    # `%in%` treats a missing flag as "not a break point".
    pressure = if_else(
      P1BreakPoint %in% 1L | P2BreakPoint %in% 1L,
      "Break point against server",
      "Normal point"
    )
  ) |>
  filter(!is.na(direction)) |>
  count(slam, pressure, direction) |>
  group_by(slam, pressure) |>
  # Shares sum to 1 within each slam x pressure group.
  mutate(pct = n / sum(n), slam_label = slam_labels[slam]) |>
  ungroup()

if (nrow(dir_df) > 0) {
  dir_df |>
    ggplot(aes(direction, pct, fill = pressure)) +
    geom_col(position = "dodge", width = 0.7) +
    scale_fill_manual(values = c("Normal point" = "#2171B5",
                                  "Break point against server" = "#CB4335")) +
    scale_y_continuous(labels = percent) +
    facet_wrap(~slam_label) +
    labs(
      title = "1st serve direction: normal vs break point",
      subtitle = "Does pressure shift the target?",
      x = NULL, y = "Share of 1st serves", fill = NULL
    )
}

Break point conversion

Code

# `bp_data`: break point conversion rate per slam-year, computed over points
# where exactly one break point flag is set; slam-years with fewer than 20
# such points are dropped.
bp_data <- pts |>
  filter(!is.na(P1BreakPoint) & !is.na(P2BreakPoint)) |>
  mutate(
    bp_opp = P1BreakPoint + P2BreakPoint,
    bp_won = P1BreakPointWon + P2BreakPointWon
  ) |>
  filter(bp_opp == 1) |>
  group_by(slam, year) |>
  summarise(
    conversion = mean(bp_won == 1, na.rm = TRUE),
    n_bp = n(),
    .groups = "drop"
  ) |>
  filter(n_bp >= 20) |>
  mutate(slam_label = slam_labels[slam])

# Conversion rate over time, one line per slam.
if (nrow(bp_data) > 0) {
  ggplot(bp_data, aes(x = year, y = conversion, colour = slam)) +
    geom_line(linewidth = 0.9) +
    geom_point(size = 2) +
    scale_y_continuous(labels = percent) +
    scale_colour_manual(values = slam_cols, labels = slam_labels) +
    labs(
      title = "Break point conversion rate over time",
      subtitle = "Share of break point opportunities converted",
      x = NULL,
      y = "Conversion rate",
      colour = NULL
    )
}

Code

# Horizontal bars: unweighted mean of the yearly conversion rates per slam,
# ordered by that mean.
if (nrow(bp_data) > 0) {
  bp_data |>
    group_by(slam, slam_label) |>
    summarise(avg_conversion = mean(conversion), .groups = "drop") |>
    mutate(slam_label = fct_reorder(slam_label, avg_conversion)) |>
    ggplot(aes(x = avg_conversion, y = slam_label, fill = slam)) +
    geom_col(width = 0.6) +
    scale_x_continuous(labels = percent) +
    scale_fill_manual(values = slam_cols) +
    labs(
      title = "Average break point conversion by slam",
      x = "Conversion rate",
      y = NULL
    ) +
    theme(legend.position = "none")
}

Distance run: winners vs losers

Do match winners consistently run more or less than their opponents? Distance run data reflects how much each player was pushed around the court.

Code

# winner column in matches may be a player name or "1"/"2" — try both
matches_winner <- matches_raw |>
  mutate(winner_int = suppressWarnings(as.integer(winner))) |>
  filter(!is.na(winner_int)) |>
  select(match_id, winner_int)

dist_data <- pts |>
  filter(!is.na(P1DistanceRun), !is.na(P2DistanceRun),
         P1DistanceRun > 0, P2DistanceRun > 0) |>
  inner_join(matches_winner, by = "match_id") |>
  mutate(
    winner_dist = if_else(winner_int == 1L, P1DistanceRun, P2DistanceRun),
    loser_dist  = if_else(winner_int == 1L, P2DistanceRun, P1DistanceRun)
  ) |>
  group_by(match_id, slam, year) |>
  summarise(
    winner_total = sum(winner_dist, na.rm = TRUE),
    loser_total  = sum(loser_dist,  na.rm = TRUE),
    .groups = "drop"
  ) |>
  filter(winner_total > 0, loser_total > 0) |>
  mutate(
    diff = winner_total - loser_total,
    slam_label = slam_labels[slam]
  )

if (nrow(dist_data) > 0) {
  dist_data |>
    pivot_longer(c(winner_total, loser_total), names_to = "result", values_to = "dist") |>
    mutate(result = if_else(result == "winner_total", "Match winner", "Match loser")) |>
    ggplot(aes(dist / 1000, fill = result)) +
    geom_density(alpha = 0.6, colour = NA) +
    scale_fill_manual(values = c("Match winner" = "#27AE60", "Match loser" = "#CB4335")) +
    scale_x_continuous(labels = \(x) paste0(x, "k")) +
    facet_wrap(~slam_label) +
    labs(
      title = "Total distance run per match: winners vs losers",
      subtitle = "Metres run by winner and loser across tracked matches",
      x = "Distance run (metres, thousands)", y = "Density", fill = NULL
    ) |> print()
} else {
  message("Distance run data not available in this dataset.")
}

Code

# Density of (winner total - loser total) distance per match, by slam, with a
# dashed reference line at zero.
if (nrow(dist_data) > 0) {
  ggplot(dist_data, aes(x = diff, fill = slam)) +
    geom_vline(xintercept = 0, colour = "grey50", linetype = "dashed") +
    geom_density(colour = NA, alpha = 0.65) +
    scale_fill_manual(values = slam_cols, labels = slam_labels) +
    labs(
      title = "Winner's distance advantage per match",
      subtitle = "Positive = winner ran more; negative = winner ran less",
      x = "Winner metres − Loser metres",
      y = "Density",
      fill = NULL
    )
}

After a long point: does the loser fade?

When a rally goes long, one player wins and one loses. Does the losing player carry any fatigue into the next point — or do elite players reset completely?

The test: for every point following a rally of length ≥ threshold, compute the win rate of the player who lost the previous long point, split by whether they are serving or returning. Compare against their overall win rate in that role.

Code

# Abort if the fatigue summary is empty.
if (nrow(fatigue_df) == 0) {
  stop("No fatigue data available.")
}

# Edge vs baseline by rally-length threshold, one line per role on the next
# point; point size encodes the sample size.
ggplot(fatigue_df, aes(x = threshold, y = edge, colour = prev_loser_role)) +
  geom_hline(yintercept = 0, colour = "grey50", linetype = "dashed") +
  geom_line(linewidth = 1.1) +
  geom_point(aes(size = n)) +
  scale_x_continuous(breaks = thresholds) +
  scale_y_continuous(labels = function(v) paste0(round(v * 100, 1), " pp")) +
  scale_colour_manual(
    values = c("Serving next" = "#2171B5", "Returning next" = "#CB4335")
  ) +
  scale_size_continuous(range = c(2, 6), labels = comma) +
  labs(
    title = "Long-point fatigue: does losing a long rally hurt the next point?",
    subtitle = "Win rate of previous-point loser on next point, relative to their baseline\nNegative = they win next point less often than expected",
    x = "Minimum rally length of previous point (shots)",
    y = "Win rate vs baseline (percentage points)",
    colour = "Role on next point",
    size = "Points (n)"
  )

Code

# Minimum rally length (shots) of the previous point for this per-slam view.
# The plot title is built from this value so the two cannot disagree.
thr_main <- 10

# Per slam and role: win rate of the previous-point loser on the next point,
# minus the role baseline joined from `baseline`.
pts_lagged |>
  filter(prev_rally >= thr_main) |>
  group_by(slam, prev_loser_role) |>
  summarise(
    win_rate = mean(loser_won_next, na.rm = TRUE),
    n = n(),
    .groups = "drop"
  ) |>
  left_join(baseline, by = "prev_loser_role") |>
  mutate(
    edge = win_rate - baseline_win,
    slam_label = slam_labels[slam]
  ) |>
  ggplot(aes(slam_label, edge, fill = slam)) +
  geom_col(width = 0.6) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  scale_fill_manual(values = slam_cols) +
  scale_y_continuous(labels = \(x) paste0(round(x * 100, 1), " pp")) +
  facet_wrap(~prev_loser_role) +
  labs(
    title = paste0("Fatigue effect by slam (rallies ≥ ", thr_main, " shots)"),
    subtitle = "Win rate of long-point loser on next point vs. baseline",
    x = NULL, y = "vs. baseline (pp)", fill = NULL
  ) +
  theme(legend.position = "none", axis.text.x = element_text(angle = 25, hjust = 1))

Code

# Summary table of the fatigue effect at thresholds 5, 10 and 15.
fatigue_df |>
  filter(threshold %in% c(5, 10, 15)) |>
  mutate(
    win_pct      = percent(win_rate, accuracy = 0.1),
    baseline_pct = percent(baseline_win, accuracy = 0.1),
    # Fixed two decimals: round() drops trailing zeros, which produced ragged
    # values such as "1.3 pp" next to "-0.81 pp".
    edge_pp      = sprintf("%.2f pp", edge * 100),
    n            = comma(n)
  ) |>
  select(
    `Min rally` = threshold,
    Role = prev_loser_role,
    `Win rate (next point)` = win_pct,
    `Baseline win rate` = baseline_pct,
    `Difference` = edge_pp,
    `N` = n
  ) |>
  arrange(`Min rally`, Role) |>
  gt() |>
  tab_header(
    title    = "Long-point fatigue effect",
    subtitle = "Win rate of previous-point loser on the immediately following point"
  ) |>
  # Highlight negative differences; the sign is recovered from the formatted
  # text because the numeric `edge` column is not carried into the table.
  tab_style(
    style = cell_text(color = "#CB4335", weight = "bold"),
    locations = cells_body(
      columns = Difference,
      rows    = as.numeric(str_remove(Difference, " pp")) < 0
    )
  )

Long-point fatigue effect
Win rate of previous-point loser on the immediately following point
Min rally	Role	Win rate (next point)	Baseline win rate	Difference	N
5	Returning next	38.8%	37.5%	1.3 pp	82,993
5	Serving next	59.7%	60.5%	-0.81 pp	71,015
10	Returning next	39.8%	37.5%	2.28 pp	17,020
10	Serving next	59.4%	60.5%	-1.12 pp	21,661
15	Returning next	40.7%	37.5%	3.22 pp	5,490
15	Serving next	58.9%	60.5%	-1.59 pp	4,401

Playstyle Clusters — Last 30 Years

Every player has a statistical “recipe” — the blend of serving aggression, consistency, and defensive resilience that defines how they win matches. K-means clustering on six serve-and-match metrics groups ATP players (1996–2025, ≥ 50 matches) into four distinct archetypes.

Features:

Metric	What it captures
Ace rate	Serving firepower
Double-fault rate	Serve risk tolerance
1st serve in %	Serve consistency
1st serve win %	Dominance when 1st serve lands
2nd serve win %	Baseline / net quality under pressure
BP save rate	Clutch performance defending break points

Code

# Elbow chart: total within-cluster sum of squares against k, with a dashed
# marker at the chosen k = 4.
ggplot(wss_df, aes(x = k, y = wss)) +
  geom_line(linewidth = 1) +
  geom_point(colour = "#2171B5", size = 3) +
  geom_vline(
    xintercept = 4,
    colour = "#CB4335",
    linetype = "dashed",
    linewidth = 0.7
  ) +
  scale_x_continuous(breaks = 1:8) +
  labs(
    title = "Choosing k: within-cluster sum of squares",
    subtitle = "Elbow at k = 4 — four distinct playstyle archetypes",
    x = "Number of clusters (k)",
    y = "Total within-cluster SS"
  )

Code

# Bar chart of each cluster centroid across the feature columns, one panel per
# playstyle; features are shown in the order given by `feat_labels`.
centroids |>
  left_join(style_map, by = "cluster") |>
  pivot_longer(
    all_of(feat_cols),
    names_to = "feature",
    values_to = "z"
  ) |>
  mutate(feature = factor(feat_labels[feature], levels = feat_labels)) |>
  ggplot(aes(x = feature, y = z, fill = style)) +
  geom_col(width = 0.7) +
  geom_hline(yintercept = 0, colour = "grey60", linetype = "dashed") +
  facet_wrap(~style, nrow = 2) +
  scale_fill_manual(values = cluster_cols) +
  labs(
    title = "Playstyle fingerprints — cluster centroids (z-scored)",
    subtitle = "Above zero = above average for that metric across all players",
    x = NULL,
    y = "Z-score vs. population mean",
    fill = NULL
  ) +
  theme(
    axis.text.x = element_text(angle = 30, hjust = 1),
    legend.position = "none"
  )

Code

# Scatter of players: ace rate against 2nd-serve win %, coloured by playstyle
# and sized by number of matches (size legend hidden).
ggplot(
  player_profiles,
  aes(x = ace_rate, y = second_win_pct, colour = style, size = n_matches)
) +
  geom_point(alpha = 0.5) +
  scale_x_continuous(labels = percent) +
  scale_y_continuous(labels = percent) +
  scale_colour_manual(values = cluster_cols) +
  scale_size_continuous(range = c(1, 4), guide = "none") +
  labs(
    title = "Player playstyle clusters: Ace Rate vs 2nd Serve Win%",
    subtitle = "Dot size = career matches played  ·  ATP 1996–2025",
    x = "Ace Rate",
    y = "2nd Serve Win%",
    colour = NULL
  )

Code

# Per-playstyle averages of the profile metrics, sorted by mean win %, rendered
# as a gt table with the top row in bold and the win % column shaded.
player_profiles |>
  group_by(style) |>
  summarise(
    Players    = n(),
    `Win %`    = mean(win_pct),
    `Ace Rate` = mean(ace_rate),
    `DF Rate`  = mean(df_rate),
    `1st In%`  = mean(first_in_pct),
    `1st Win%` = mean(first_win_pct),
    `2nd Win%` = mean(second_win_pct),
    `BP Save%` = mean(bp_save_rate),
    .groups = "drop"
  ) |>
  arrange(desc(`Win %`)) |>
  gt() |>
  tab_header(
    title = "Playstyles ranked by win %",
    subtitle = "ATP 1996–2025 · players with ≥ 50 matches · k-means (k = 4)"
  ) |>
  cols_label(style = "Playstyle") |>
  fmt_number(columns = Players, decimals = 0) |>
  fmt_percent(
    columns = c(
      `Win %`, `Ace Rate`, `DF Rate`, `1st In%`,
      `1st Win%`, `2nd Win%`, `BP Save%`
    ),
    decimals = 1
  ) |>
  data_color(
    columns = `Win %`,
    palette = c("#FDECEA", "#A93226")
  ) |>
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(rows = 1)
  ) |>
  tab_footnote(
    footnote = "Raw win % across all tour-level ATP matches; not adjusted for opposition strength.",
    locations = cells_column_labels(columns = `Win %`)
  )

Playstyles ranked by win %
ATP 1996–2025 · players with ≥ 50 matches · k-means (k = 4)
Playstyle	Players	Win %¹	Ace Rate	DF Rate	1st In%	1st Win%	2nd Win%	BP Save%
Big Server	92	55.3%	12.3%	4.2%	60.9%	76.1%	51.6%	64.5%
Aggressive Baseliner	187	48.0%	6.2%	3.2%	61.8%	69.9%	51.1%	60.7%
All-Court	135	43.7%	8.0%	5.0%	56.6%	72.4%	48.4%	59.9%
Counter-Puncher	158	38.1%	4.4%	3.9%	61.5%	66.5%	48.0%	56.8%
¹ Raw win % across all tour-level ATP matches; not adjusted for opposition strength.

Code

# Six highest win-% players per playstyle (ties broken arbitrarily), rendered
# as a gt table grouped by playstyle.
player_profiles |>
  group_by(style) |>
  slice_max(win_pct, n = 6, with_ties = FALSE) |>
  ungroup() |>
  arrange(style, desc(win_pct)) |>
  select(
    style, player, n_matches, win_pct,
    ace_rate, second_win_pct, bp_save_rate
  ) |>
  gt() |>
  tab_header(
    title = "Top players by playstyle",
    subtitle = "Highest win % within each cluster (min 50 matches · ATP 1996–2025)"
  ) |>
  fmt_number(columns = n_matches, decimals = 0) |>
  fmt_percent(
    columns = c(win_pct, ace_rate, second_win_pct, bp_save_rate),
    decimals = 1
  ) |>
  cols_label(
    style = "Playstyle",
    player = "Player",
    n_matches = "Matches",
    win_pct = "Win %",
    ace_rate = "Ace Rate",
    second_win_pct = "2nd Win%",
    bp_save_rate = "BP Save%"
  ) |>
  # The order of these calls determines the order of the groups in the table,
  # so it is kept exactly as is.
  tab_row_group(label = "Big Server", rows = style == "Big Server") |>
  tab_row_group(
    label = "Aggressive Baseliner",
    rows = style == "Aggressive Baseliner"
  ) |>
  tab_row_group(label = "Counter-Puncher", rows = style == "Counter-Puncher") |>
  tab_row_group(label = "All-Court", rows = style == "All-Court") |>
  tab_style(
    style = cell_fill(color = "#F4F4F4"),
    locations = cells_row_groups()
  ) |>
  data_color(
    columns = win_pct,
    palette = c("#FDECEA", "#A93226")
  )

Top players by playstyle
Highest win % within each cluster (min 50 matches · ATP 1996–2025)
Playstyle	Player	Matches	Win %	Ace Rate	2nd Win%	BP Save%
All-Court
All-Court	Lleyton Hewitt	817	70.1%	7.7%	52.7%	61.9%
All-Court	Yevgeny Kafelnikov	641	66.6%	7.4%	50.4%	60.9%
All-Court	Tim Henman	696	64.9%	7.4%	50.8%	63.1%
All-Court	Petr Korda	200	64.5%	6.3%	49.4%	62.3%
All-Court	Thomas Enqvist	506	60.3%	10.2%	50.2%	60.0%
All-Court	Magnus Gustafsson	262	59.2%	4.9%	49.4%	60.6%
Counter-Puncher
Counter-Puncher	Guillermo Coria	321	66.4%	3.8%	50.1%	58.6%
Counter-Puncher	Gaston Gaudio	444	57.4%	3.5%	49.9%	58.3%
Counter-Puncher	Alberto Berasategui	250	56.0%	2.3%	47.5%	56.5%
Counter-Puncher	Juan Ignacio Chela	574	53.5%	4.0%	50.3%	58.4%
Counter-Puncher	Diego Schwartzman	466	53.2%	2.5%	49.8%	56.8%
Counter-Puncher	Todd Woodbridge	192	52.6%	4.0%	47.0%	57.8%
Aggressive Baseliner
Aggressive Baseliner	Rafael Nadal	1,280	82.5%	4.3%	57.1%	66.0%
Aggressive Baseliner	Carlos Alcaraz	263	79.1%	5.1%	55.7%	63.8%
Aggressive Baseliner	Andy Murray	970	73.3%	8.5%	51.7%	62.0%
Aggressive Baseliner	Marcelo Rios	460	67.6%	6.5%	52.9%	61.2%
Aggressive Baseliner	Kei Nishikori	639	65.9%	4.1%	53.2%	60.9%
Aggressive Baseliner	David Nalbandian	544	65.8%	4.6%	52.3%	59.3%
Big Server
Big Server	Novak Djokovic	1,299	83.9%	7.3%	55.5%	65.6%
Big Server	Roger Federer	1,458	82.0%	10.0%	56.8%	67.3%
Big Server	Pete Sampras	400	77.5%	13.9%	52.5%	68.2%
Big Server	Jannik Sinner	340	76.8%	7.5%	54.8%	66.6%
Big Server	Andre Agassi	567	76.0%	7.1%	53.9%	65.1%
Big Server	Andy Roddick	773	74.5%	15.5%	55.9%	67.6%

--- title: "Grand Slam Point-by-Point Analysis" description: "Serve patterns, rally length, break point pressure, distance run, and long-point fatigue across all four Grand Slams (2011–2020)" date: "2026-03-12" categories: [Tennis, R, Data Analysis] execute: warning: false message: false freeze: true --- ```{r setup} #| echo: false library(tidyverse) library(scales) library(gt) theme_set( theme_minimal(base_size = 13) + theme( plot.title = element_text(face = "bold", size = 16), plot.subtitle = element_text(colour = "grey40", size = 12), panel.grid.minor = element_blank(), legend.position = "bottom" ) ) slam_cols <- c( "ausopen" = "#0057A8", "frenchopen" = "#C8593C", "usopen" = "#3D5A99", "wimbledon" = "#27AE60" ) slam_labels <- c( "ausopen" = "Australian Open", "frenchopen" = "French Open", "usopen" = "US Open", "wimbledon" = "Wimbledon" ) ``` ## Data Point-by-point data from [Jeff Sackmann's Grand Slam repository](https://github.com/JeffSackmann/tennis_slam_pointbypoint). Covers matches on courts with the Hawkeye tracking system — typically from the second week onward plus select early-round showcourts. Australian Open runs 2011–2020; the other three slams run 2011–2019. 
```{r load-data} data_dir <- here::here("data-projects", "data") slams <- c("ausopen", "frenchopen", "usopen", "wimbledon") ao_years <- 2011:2020 other_years <- 2011:2019 slam_years <- bind_rows( tibble(slam = "ausopen", year = ao_years), tibble(slam = "frenchopen", year = other_years), tibble(slam = "usopen", year = other_years), tibble(slam = "wimbledon", year = other_years) ) base_url <- "https://raw.githubusercontent.com/JeffSackmann/tennis_slam_pointbypoint/master/" # ── Matches ────────────────────────────────────────────────────────────────── matches_cache <- file.path(data_dir, "slam_matches.csv") if (!file.exists(matches_cache)) { matches_raw <- pmap_dfr(slam_years, function(slam, year) { url <- paste0(base_url, year, "-", slam, "-matches.csv") tryCatch( read_csv(url, show_col_types = FALSE, col_types = cols(.default = "c")) |> mutate(slam = slam, year = year), error = \(e) tibble() ) }) write_csv(matches_raw, matches_cache) } else { matches_raw <- read_csv(matches_cache, show_col_types = FALSE) } # ── Points ──────────────────────────────────────────────────────────────────── points_cache <- file.path(data_dir, "slam_points.csv") keep_cols <- c( "match_id", "SetNo", "GameNo", "PointNumber", "PointServer", "P1GamesWon", "P2GamesWon", "P1PointsWon", "P2PointsWon", "SetWinner", "GameWinner", "PointWinner", "P1Score", "P2Score", "Speed_KMH", "ServeWidth", "ServeIndicator", "ServeDepth", "ReturnDepth", "RallyCount", "P1Ace", "P2Ace", "P1Winner", "P2Winner", "P1DoubleFault", "P2DoubleFault", "P1UnfErr", "P2UnfErr", "P1NetPoint", "P2NetPoint", "P1BreakPoint", "P2BreakPoint", "P1BreakPointWon", "P2BreakPointWon", "P1BreakPointMissed", "P2BreakPointMissed", "P1DistanceRun", "P2DistanceRun" ) if (!file.exists(points_cache)) { points_raw <- pmap_dfr(slam_years, function(slam, year) { url <- paste0(base_url, year, "-", slam, "-points.csv") tryCatch({ df <- read_csv(url, show_col_types = FALSE, col_types = cols(.default = "c")) # keep only columns that exist in this 
file df[intersect(keep_cols, names(df))] |> mutate(slam = slam, year = as.integer(year)) }, error = \(e) tibble()) }) write_csv(points_raw, points_cache) } else { points_raw <- read_csv(points_cache, show_col_types = FALSE) } # ── Clean & type-cast ───────────────────────────────────────────────────────── pts <- points_raw |> mutate( across(any_of(c("year", "SetNo", "GameNo", "PointNumber", "PointServer", "PointWinner", "GameWinner", "RallyCount", "P1Ace", "P2Ace", "P1Winner", "P2Winner", "P1DoubleFault", "P2DoubleFault", "P1UnfErr", "P2UnfErr", "P1BreakPoint", "P2BreakPoint", "P1BreakPointWon", "P2BreakPointWon")), \(x) suppressWarnings(as.integer(x))), across(any_of(c("Speed_KMH", "P1DistanceRun", "P2DistanceRun")), \(x) suppressWarnings(as.numeric(x))), slam_label = slam_labels[slam] ) matches <- matches_raw |> mutate(year = as.integer(year)) ``` ## Coverage overview Not every match is tracked — Hawkeye data is only available for select courts. This heatmap shows how many matches have point data per slam per year. ```{r coverage} #| fig-height: 4 #| fig-width: 9 pts |> distinct(match_id, slam, year) |> count(slam, year) |> mutate(slam_label = slam_labels[slam]) |> ggplot(aes(factor(year), slam_label, fill = n)) + geom_tile(colour = "white", linewidth = 0.5) + geom_text(aes(label = n), size = 3.5, colour = "white", fontface = "bold") + scale_fill_gradient(low = "#9ECAE1", high = "#08519C") + labs( title = "Matches with point-by-point data", x = NULL, y = NULL, fill = "Matches" ) + theme(legend.position = "right", axis.text.x = element_text(angle = 45, hjust = 1)) ``` ## Rally length by slam Clay courts at Roland Garros produce the longest rallies; grass at Wimbledon the shortest. The distributions below pool all points by slam. 
```{r rally-dist} #| fig-height: 5 #| fig-width: 11 pts |> filter(!is.na(RallyCount), RallyCount >= 1, RallyCount <= 40) |> mutate(slam_label = fct_reorder(slam_label, RallyCount, median, .desc = TRUE)) |> ggplot(aes(RallyCount, fill = slam)) + geom_histogram(binwidth = 1, colour = NA, alpha = 0.85) + scale_fill_manual(values = slam_cols, labels = slam_labels) + facet_wrap(~slam_label, scales = "free_y") + labs( title = "Rally length distribution by slam", subtitle = "All tracked points, 2011–2020", x = "Rally length (shots)", y = "Points", fill = NULL ) + theme(legend.position = "none") ``` ```{r rally-trend} #| fig-height: 5 #| fig-width: 11 pts |> filter(!is.na(RallyCount), RallyCount >= 1) |> group_by(slam, year) |> summarise( median_rally = median(RallyCount), mean_rally = mean(RallyCount), .groups = "drop" ) |> mutate(slam_label = slam_labels[slam]) |> ggplot(aes(year, median_rally, colour = slam)) + geom_line(linewidth = 0.9) + geom_point(size = 2) + scale_colour_manual(values = slam_cols, labels = slam_labels) + labs( title = "Median rally length over time by slam", x = NULL, y = "Median rally length (shots)", colour = NULL ) ``` ## Serve speed First serves are substantially faster than second serves at every slam. Hard-court slams (AO, USO) tend to produce slightly higher serve speeds than clay or grass. 
```{r serve-speed-dist} #| fig-height: 5 #| fig-width: 11 speed_df <- pts |> filter(!is.na(Speed_KMH), Speed_KMH > 100, Speed_KMH < 260, ServeIndicator %in% c("1", "2")) |> mutate( serve = if_else(ServeIndicator == "1", "1st serve", "2nd serve"), slam_label = slam_labels[slam] ) if (nrow(speed_df) > 0) { speed_df |> ggplot(aes(Speed_KMH, fill = slam)) + geom_density(alpha = 0.65, colour = NA) + scale_fill_manual(values = slam_cols, labels = slam_labels) + facet_grid(serve ~ slam_label) + labs( title = "Serve speed distribution by slam and serve number", x = "Speed (km/h)", y = "Density", fill = NULL ) + theme(legend.position = "none") } ``` ```{r serve-speed-trend} #| fig-height: 5 #| fig-width: 11 pts |> filter(!is.na(Speed_KMH), Speed_KMH > 100, Speed_KMH < 260, ServeIndicator %in% c("1", "2")) |> mutate(serve = if_else(ServeIndicator == "1", "1st serve", "2nd serve")) |> group_by(slam, year, serve) |> summarise(median_speed = median(Speed_KMH), .groups = "drop") |> mutate(slam_label = slam_labels[slam]) |> ggplot(aes(year, median_speed, colour = slam, linetype = serve)) + geom_line(linewidth = 0.9) + geom_point(size = 2) + scale_colour_manual(values = slam_cols, labels = slam_labels) + labs( title = "Median serve speed over time", x = NULL, y = "Median speed (km/h)", colour = NULL, linetype = NULL ) ``` ## Serve direction under pressure Does a server change their target when facing a break point? `ServeWidth` encodes: **W** (wide), **B/BC/BW** (body), **C** (centre/T). 
```{r serve-direction}
#| fig-height: 6
#| fig-width: 11

# Share of 1st serves hit to each target, split by break point vs normal point.
dir_df <- pts |>
  filter(!is.na(ServeWidth), ServeWidth != "", ServeIndicator == "1") |>
  mutate(
    # ServeWidth codes: W = wide; B, BC, BW = body; C = centre/T.
    direction = case_when(
      ServeWidth == "W"                  ~ "Wide",
      ServeWidth %in% c("B", "BC", "BW") ~ "Body",
      ServeWidth == "C"                  ~ "Centre/T",
      TRUE                               ~ NA_character_
    ),
    # A break point is always held against whoever is serving, so either
    # player's flag marks the pressure situation. This avoids depending on
    # which player the P1/P2 flag is attributed to. Rows where neither flag
    # is 1 (including missing flags) fall through to "Normal point".
    pressure = case_when(
      P1BreakPoint == 1 | P2BreakPoint == 1 ~ "Break point against server",
      TRUE                                  ~ "Normal point"
    )
  ) |>
  filter(!is.na(direction)) |>
  count(slam, pressure, direction) |>
  group_by(slam, pressure) |>
  mutate(pct = n / sum(n), slam_label = slam_labels[slam]) |>
  ungroup()

if (nrow(dir_df) > 0) {
  dir_df |>
    ggplot(aes(direction, pct, fill = pressure)) +
    geom_col(position = "dodge", width = 0.7) +
    scale_fill_manual(
      values = c(
        "Normal point" = "#2171B5",
        "Break point against server" = "#CB4335"
      )
    ) +
    scale_y_continuous(labels = percent) +
    facet_wrap(~slam_label) +
    labs(
      title = "1st serve direction: normal vs break point",
      subtitle = "Does pressure shift the target?",
      x = NULL,
      y = "Share of 1st serves",
      fill = NULL
    )
}
```

## Break point conversion

```{r bp-conversion}
#| fig-height: 5
#| fig-width: 11

# Yearly conversion rate per slam. A row counts as one break point opportunity
# when exactly one of the two players' flags is set; slam-years with fewer
# than 20 opportunities are dropped as too noisy to plot.
bp_data <- pts |>
  filter(!is.na(P1BreakPoint), !is.na(P2BreakPoint)) |>
  mutate(
    bp_opp = P1BreakPoint + P2BreakPoint,
    bp_won = P1BreakPointWon + P2BreakPointWon
  ) |>
  filter(bp_opp == 1) |>
  group_by(slam, year) |>
  summarise(
    conversion = mean(bp_won == 1, na.rm = TRUE),
    n_bp = n(),
    .groups = "drop"
  ) |>
  filter(n_bp >= 20) |>
  mutate(slam_label = slam_labels[slam])

if (nrow(bp_data) > 0) {
  bp_data |>
    ggplot(aes(year, conversion, colour = slam)) +
    geom_line(linewidth = 0.9) +
    geom_point(size = 2) +
    scale_colour_manual(values = slam_cols, labels = slam_labels) +
    scale_y_continuous(labels = percent) +
    labs(
      title = "Break point conversion rate over time",
      subtitle = "Share of break point opportunities converted",
      x = NULL,
      y = "Conversion rate",
      colour = NULL
    )
}
```

```{r bp-by-slam}
#| fig-height: 4
#| fig-width: 9

# Unweighted mean of the yearly conversion rates, one bar per slam.
if (nrow(bp_data) > 0) {
  bp_data |>
    group_by(slam, slam_label) |>
    summarise(avg_conversion = mean(conversion), .groups = "drop") |>
    mutate(slam_label = fct_reorder(slam_label, avg_conversion)) |>
    ggplot(aes(avg_conversion, slam_label, fill = slam)) +
    geom_col(width = 0.6) +
    scale_fill_manual(values = slam_cols) +
    scale_x_continuous(labels = percent) +
    labs(
      title = "Average break point conversion by slam",
      x = "Conversion rate",
      y = NULL
    ) +
    theme(legend.position = "none")
}
```

## Distance run: winners vs losers

Do match winners consistently run more or less than their opponents? Distance run data reflects how much each player was pushed around the court.

```{r distance-run}
#| fig-height: 5
#| fig-width: 11

# Only matches whose `winner` value parses as an integer are usable here
# (1 is read as "player 1 won", anything else as "player 2 won"). Rows where
# `winner` holds something else, e.g. a player name, become NA and are dropped.
matches_winner <- matches_raw |>
  mutate(winner_int = suppressWarnings(as.integer(winner))) |>
  filter(!is.na(winner_int)) |>
  select(match_id, winner_int)

# Per-match totals of the per-point distances, attributed to winner and loser.
dist_data <- pts |>
  filter(
    !is.na(P1DistanceRun), !is.na(P2DistanceRun),
    P1DistanceRun > 0, P2DistanceRun > 0
  ) |>
  inner_join(matches_winner, by = "match_id") |>
  mutate(
    winner_dist = if_else(winner_int == 1L, P1DistanceRun, P2DistanceRun),
    loser_dist  = if_else(winner_int == 1L, P2DistanceRun, P1DistanceRun)
  ) |>
  group_by(match_id, slam, year) |>
  summarise(
    winner_total = sum(winner_dist, na.rm = TRUE),
    loser_total  = sum(loser_dist, na.rm = TRUE),
    .groups = "drop"
  ) |>
  filter(winner_total > 0, loser_total > 0) |>
  mutate(
    diff = winner_total - loser_total,
    slam_label = slam_labels[slam]
  )

if (nrow(dist_data) > 0) {
  # No trailing `|> print()`: the pipe binds tighter than `+`, so it would
  # print only the labels object. The plot is auto-printed as the value of
  # this `if`, exactly like the other chunks.
  dist_data |>
    pivot_longer(
      c(winner_total, loser_total),
      names_to = "result",
      values_to = "dist"
    ) |>
    mutate(
      result = if_else(result == "winner_total", "Match winner", "Match loser")
    ) |>
    ggplot(aes(dist / 1000, fill = result)) +
    geom_density(alpha = 0.6, colour = NA) +
    scale_fill_manual(
      values = c("Match winner" = "#27AE60", "Match loser" = "#CB4335")
    ) +
    scale_x_continuous(labels = \(x) paste0(x, "k")) +
    facet_wrap(~slam_label) +
    labs(
      title = "Total distance run per match: winners vs losers",
      subtitle = "Metres run by winner and loser across tracked matches",
      x = "Distance run (metres, thousands)",
      y = "Density",
      fill = NULL
    )
} else {
  message("Distance run data not available in this dataset.")
}
```

```{r distance-diff}
#| fig-height: 4
#| fig-width: 9

# Distribution of (winner total - loser total) per match; dashed line at zero.
if (nrow(dist_data) > 0) {
  dist_data |>
    ggplot(aes(diff, fill = slam)) +
    geom_vline(xintercept = 0, linetype = "dashed", colour = "grey50") +
    geom_density(alpha = 0.65, colour = NA) +
    scale_fill_manual(values = slam_cols, labels = slam_labels) +
    labs(
      title = "Winner's distance advantage per match",
      subtitle = "Positive = winner ran more; negative = winner ran less",
      x = "Winner metres − Loser metres",
      y = "Density",
      fill = NULL
    )
}
```

## After a long point: does the loser fade?

When a rally goes long, one player wins and one loses. Does the losing player carry any fatigue into the **next** point — or do elite players reset completely?

The test: for every point following a rally of length ≥ threshold, compute the win rate of the player who **lost** the previous long point, split by whether they are serving or returning. Compare against their overall win rate in that role.

```{r fatigue-data}
#| echo: false

# Order points within each match in playing order. Only rows with a valid
# winner and server code (1 or 2) are kept: the loser is derived below as
# "the other player", which would silently mislabel any other code.
# NOTE(review): whether `pts` contains such codes (e.g. 0 on placeholder
# rows) is not visible here; the filter is a no-op if it does not.
pts_sorted <- pts |>
  filter(
    !is.na(RallyCount),
    PointWinner %in% c(1, 2),
    PointServer %in% c(1, 2)
  ) |>
  arrange(match_id, SetNo, GameNo, PointNumber)

# For each point, tag the loser of the PREVIOUS retained point in the same
# match and that point's rally length.
pts_lagged <- pts_sorted |>
  group_by(match_id) |>
  mutate(
    prev_rally  = lag(RallyCount),
    prev_winner = lag(PointWinner),   # who won previous point (1 or 2)
    prev_server = lag(PointServer)    # who served previous point
  ) |>
  ungroup() |>
  filter(!is.na(prev_rally), !is.na(prev_winner)) |>
  mutate(
    # loser of previous point
    prev_loser = if_else(prev_winner == 1L, 2L, 1L),
    # did the loser of previous point win this point?
    loser_won_next = as.integer(PointWinner == prev_loser),
    # role of that loser on THIS point
    prev_loser_role = case_when(
      prev_loser == PointServer ~ "Serving next",
      TRUE                      ~ "Returning next"
    )
  )

# Baselines: win rate of the previous-point loser by role, over all points
# regardless of how long the previous rally was.
baseline <- pts_lagged |>
  group_by(prev_loser_role) |>
  summarise(
    baseline_win = mean(loser_won_next, na.rm = TRUE),
    .groups = "drop"
  )

# Fatigue effect by rally threshold: same win rate restricted to points that
# follow a rally of at least `thr` shots, expressed relative to the baseline.
thresholds <- c(5, 8, 10, 12, 15, 20)

fatigue_df <- map_dfr(thresholds, function(thr) {
  pts_lagged |>
    filter(prev_rally >= thr) |>
    group_by(prev_loser_role) |>
    summarise(
      win_rate = mean(loser_won_next, na.rm = TRUE),
      n = n(),
      .groups = "drop"
    ) |>
    mutate(threshold = thr)
}) |>
  left_join(baseline, by = "prev_loser_role") |>
  mutate(edge = win_rate - baseline_win)
```

```{r fatigue-plot}
#| fig-height: 5
#| fig-width: 10

# Deliberately halts the render: the remaining fatigue chunks need this data.
if (nrow(fatigue_df) == 0) stop("No fatigue data available.")

fatigue_df |>
  ggplot(aes(threshold, edge, colour = prev_loser_role)) +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "grey50") +
  geom_line(linewidth = 1.1) +
  geom_point(aes(size = n)) +
  scale_colour_manual(
    values = c("Serving next" = "#2171B5", "Returning next" = "#CB4335")
  ) +
  scale_size_continuous(range = c(2, 6), labels = comma) +
  scale_x_continuous(breaks = thresholds) +
  scale_y_continuous(labels = \(x) paste0(round(x * 100, 1), " pp")) +
  labs(
    title = "Long-point fatigue: does losing a long rally hurt the next point?",
    subtitle = "Win rate of previous-point loser on next point, relative to their baseline\nNegative = they win next point less often than expected",
    x = "Minimum rally length of previous point (shots)",
    y = "Win rate vs baseline (percentage points)",
    colour = "Role on next point",
    size = "Points (n)"
  )
```

```{r fatigue-by-slam}
#| fig-height: 5
#| fig-width: 11

thr_main <- 10  # focus on rallies of 10+ shots

pts_lagged |>
  filter(prev_rally >= thr_main) |>
  group_by(slam, prev_loser_role) |>
  summarise(
    win_rate = mean(loser_won_next, na.rm = TRUE),
    n = n(),
    .groups = "drop"
  ) |>
  left_join(baseline, by = "prev_loser_role") |>
  mutate(
    edge = win_rate - baseline_win,
    slam_label = slam_labels[slam]
  ) |>
  ggplot(aes(slam_label, edge, fill = slam)) +
  geom_col(width = 0.6) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  scale_fill_manual(values = slam_cols) +
  scale_y_continuous(labels = \(x) paste0(round(x * 100, 1), " pp")) +
  facet_wrap(~prev_loser_role) +
  labs(
    # Built from `thr_main` so the title cannot drift from the filter above.
    title = paste0("Fatigue effect by slam (rallies ≥ ", thr_main, " shots)"),
    subtitle = "Win rate of long-point loser on next point vs. baseline",
    x = NULL,
    y = "vs. baseline (pp)",
    fill = NULL
  ) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 25, hjust = 1)
  )
```

```{r fatigue-table}
# Summary table for three thresholds; negative differences are highlighted.
fatigue_df |>
  filter(threshold %in% c(5, 10, 15)) |>
  mutate(
    win_pct      = percent(win_rate, accuracy = 0.1),
    baseline_pct = percent(baseline_win, accuracy = 0.1),
    edge_pp      = paste0(round(edge * 100, 2), " pp"),
    n            = comma(n)
  ) |>
  select(
    `Min rally` = threshold,
    Role = prev_loser_role,
    `Win rate (next point)` = win_pct,
    `Baseline win rate` = baseline_pct,
    `Difference` = edge_pp,
    `N` = n
  ) |>
  arrange(`Min rally`, Role) |>
  gt() |>
  tab_header(
    title = "Long-point fatigue effect",
    subtitle = "Win rate of previous-point loser on the immediately following point"
  ) |>
  tab_style(
    style = cell_text(color = "#CB4335", weight = "bold"),
    locations = cells_body(
      columns = Difference,
      # `Difference` is already formatted text, so parse the number back out.
      rows = as.numeric(str_remove(Difference, " pp")) < 0
    )
  )
```

## Playstyle Clusters — Last 30 Years

Every player has a statistical "recipe" — the blend of serving aggression, consistency, and defensive resilience that defines how they win matches. K-means clustering on six serve-and-match metrics groups ATP players (1996–2025, ≥ 50 matches) into four distinct archetypes.
**Features:**

| Metric | What it captures |
|---|---|
| Ace rate | Serving firepower |
| Double-fault rate | Serve risk tolerance |
| 1st serve in % | Serve consistency |
| 1st serve win % | Dominance when 1st serve lands |
| 2nd serve win % | Baseline / net quality under pressure |
| BP save rate | Clutch performance defending break points |

```{r atp-load-clusters}
#| echo: false

atp_path <- here::here("data-projects", "data", "atp_matches.csv")

# Everything is read as text, then the year and the serve-stat columns are
# converted explicitly; values that fail to parse become NA without warnings.
atp_raw <- read_csv(
  atp_path,
  col_types = cols(.default = "c"),
  show_col_types = FALSE
) |>
  mutate(year = as.integer(substr(tourney_date, 1, 4))) |>
  mutate(
    across(
      c(
        w_ace, w_df, w_svpt, w_1stIn, w_1stWon, w_2ndWon, w_bpSaved, w_bpFaced,
        l_ace, l_df, l_svpt, l_1stIn, l_1stWon, l_2ndWon, l_bpSaved, l_bpFaced
      ),
      function(col) suppressWarnings(as.numeric(col))
    )
  ) |>
  # Keep matches from 1996 onwards in which both players have serve points.
  filter(year >= 1996) |>
  filter(!is.na(w_svpt), w_svpt > 0) |>
  filter(!is.na(l_svpt), l_svpt > 0)
```

```{r player-profiles}
#| echo: false

# One row per player-match: the winner's and the loser's serve stats under
# common column names, with `won` recording the result.
winner_rows <- atp_raw |>
  select(
    player = winner_name,
    svpt = w_svpt, ace = w_ace, df = w_df,
    first_in = w_1stIn, first_won = w_1stWon, second_won = w_2ndWon,
    bp_saved = w_bpSaved, bp_faced = w_bpFaced
  ) |>
  mutate(won = 1L, .after = player)

loser_rows <- atp_raw |>
  select(
    player = loser_name,
    svpt = l_svpt, ace = l_ace, df = l_df,
    first_in = l_1stIn, first_won = l_1stWon, second_won = l_2ndWon,
    bp_saved = l_bpSaved, bp_faced = l_bpFaced
  ) |>
  mutate(won = 0L, .after = player)

# Career totals per player first, then rates as ratios of those totals.
# Denominators are floored at 1 so a player with no qualifying points gets a
# rate of 0 rather than a division by zero.
player_profiles <- bind_rows(winner_rows, loser_rows) |>
  filter(svpt > 0) |>
  group_by(player) |>
  summarise(
    n_matches = n(),
    win_pct = mean(won),
    across(
      c(svpt, ace, df, first_in, first_won, second_won, bp_saved, bp_faced),
      function(col) sum(col, na.rm = TRUE),
      .names = "tot_{.col}"
    ),
    .groups = "drop"
  ) |>
  mutate(
    ace_rate       = tot_ace / tot_svpt,
    df_rate        = tot_df / tot_svpt,
    first_in_pct   = tot_first_in / tot_svpt,
    first_win_pct  = tot_first_won / pmax(tot_first_in, 1),
    second_win_pct = tot_second_won / pmax(tot_svpt - tot_first_in, 1),
    bp_save_rate   = tot_bp_saved / pmax(tot_bp_faced, 1)
  ) |>
  select(-starts_with("tot_")) |>
  filter(n_matches >= 50, first_win_pct > 0.01, second_win_pct > 0.01)
```

```{r kmeans-setup}
#| echo: false

set.seed(42)

feat_cols <- c(
  "ace_rate", "df_rate", "first_in_pct",
  "first_win_pct", "second_win_pct", "bp_save_rate"
)

# Z-scored feature matrix, one row per player.
feat_mat <- scale(player_profiles[feat_cols])

# Total within-cluster sum of squares for k = 1..8 (elbow plot input).
wss_df <- tibble(k = 1:8) |>
  mutate(
    wss = map_dbl(
      k,
      function(n_centers) {
        kmeans(feat_mat, centers = n_centers, nstart = 25)$tot.withinss
      }
    )
  )

km <- kmeans(feat_mat, centers = 4, nstart = 50)

# Cluster naming rule, applied to the centroids:
#   highest ace rate                                  -> Big Server
#   lowest ace rate                                   -> Counter-Puncher
#   higher 2nd-serve win % of the two clusters left   -> Aggressive Baseliner
#   the last one                                      -> All-Court
centroids <- as_tibble(km$centers) |>
  mutate(cluster = seq_len(n()))

big_srv_cl <- with(centroids, cluster[which.max(ace_rate)])
counter_cl <- with(centroids, cluster[which.min(ace_rate)])

remaining <- centroids |>
  filter(!cluster %in% c(big_srv_cl, counter_cl))

agg_cl    <- with(remaining, cluster[which.max(second_win_pct)])
allcrt_cl <- setdiff(remaining$cluster, agg_cl)

style_map <- tribble(
  ~cluster,   ~style,
  big_srv_cl, "Big Server",
  counter_cl, "Counter-Puncher",
  agg_cl,     "Aggressive Baseliner",
  allcrt_cl,  "All-Court"
)

# Attach each player's cluster id and its style name.
player_profiles <- player_profiles |>
  mutate(cluster = km$cluster) |>
  left_join(style_map, by = "cluster")

cluster_cols <- c(
  "Big Server"           = "#2171B5",
  "Counter-Puncher"      = "#C8593C",
  "Aggressive Baseliner" = "#27AE60",
  "All-Court"            = "#6B4F8B"
)

feat_labels <- c(
  ace_rate       = "Ace Rate",
  df_rate        = "DF Rate",
  first_in_pct   = "1st In%",
  first_win_pct  = "1st Win%",
  second_win_pct = "2nd Win%",
  bp_save_rate   = "BP Save%"
)
```

```{r elbow-plot}
#| fig-height: 4
#| fig-width: 7

# Elbow curve with the chosen k = 4 marked by a dashed line.
ggplot(wss_df, aes(x = k, y = wss)) +
  geom_line(linewidth = 1) +
  geom_point(size = 3, colour = "#2171B5") +
  geom_vline(
    xintercept = 4,
    linetype = "dashed",
    colour = "#CB4335",
    linewidth = 0.7
  ) +
  labs(
    title = "Choosing k: within-cluster sum of squares",
    subtitle = "Elbow at k = 4 — four distinct playstyle archetypes",
    x = "Number of clusters (k)",
    y = "Total within-cluster SS"
  ) +
  scale_x_continuous(breaks = 1:8)
```

```{r cluster-fingerprints}
#| fig-height: 5
#| fig-width: 11

# Centroid z-scores per feature, one panel per playstyle.
centroids |>
  left_join(style_map, by = "cluster") |>
  pivot_longer(all_of(feat_cols), names_to = "feature", values_to = "z") |>
  mutate(feature = factor(feat_labels[feature], levels = feat_labels)) |>
  ggplot(aes(x = feature, y = z, fill = style)) +
  geom_col(width = 0.7) +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "grey60") +
  facet_wrap(~style, nrow = 2) +
  scale_fill_manual(values = cluster_cols) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 30, hjust = 1)
  ) +
  labs(
    title = "Playstyle fingerprints — cluster centroids (z-scored)",
    subtitle = "Above zero = above average for that metric across all players",
    x = NULL,
    y = "Z-score vs. population mean",
    fill = NULL
  )
```

```{r cluster-scatter}
#| fig-height: 5
#| fig-width: 10

# Each dot is a player, coloured by playstyle and sized by matches played.
ggplot(
  player_profiles,
  aes(x = ace_rate, y = second_win_pct, colour = style, size = n_matches)
) +
  geom_point(alpha = 0.5) +
  scale_x_continuous(labels = percent) +
  scale_y_continuous(labels = percent) +
  scale_size_continuous(range = c(1, 4), guide = "none") +
  scale_colour_manual(values = cluster_cols) +
  labs(
    title = "Player playstyle clusters: Ace Rate vs 2nd Serve Win%",
    subtitle = "Dot size = career matches played · ATP 1996–2025",
    x = "Ace Rate",
    y = "2nd Serve Win%",
    colour = NULL
  )
```

```{r playstyle-win-table}
# Unweighted per-player means within each playstyle, best win % first.
player_profiles |>
  group_by(style) |>
  summarise(
    Players = n(),
    across(
      c(
        win_pct, ace_rate, df_rate, first_in_pct,
        first_win_pct, second_win_pct, bp_save_rate
      ),
      mean
    ),
    .groups = "drop"
  ) |>
  rename(
    `Win %`    = win_pct,
    `Ace Rate` = ace_rate,
    `DF Rate`  = df_rate,
    `1st In%`  = first_in_pct,
    `1st Win%` = first_win_pct,
    `2nd Win%` = second_win_pct,
    `BP Save%` = bp_save_rate
  ) |>
  arrange(desc(`Win %`)) |>
  gt() |>
  tab_header(
    title = "Playstyles ranked by win %",
    subtitle = "ATP 1996–2025 · players with ≥ 50 matches · k-means (k = 4)"
  ) |>
  fmt_percent(
    columns = c(
      `Win %`, `Ace Rate`, `DF Rate`,
      `1st In%`, `1st Win%`, `2nd Win%`, `BP Save%`
    ),
    decimals = 1
  ) |>
  fmt_number(columns = Players, decimals = 0) |>
  cols_label(style = "Playstyle") |>
  data_color(
    columns = `Win %`,
    palette = c("#FDECEA", "#A93226")
  ) |>
  # Row 1 is the top-ranked playstyle after the sort above.
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(rows = 1)
  ) |>
  tab_footnote(
    footnote = "Raw win % across all tour-level ATP matches; not adjusted for opposition strength.",
    locations = cells_column_labels(columns = `Win %`)
  )
```

```{r notable-players-table}
# Six highest win-% players per playstyle, shown in one row group per style.
player_profiles |>
  group_by(style) |>
  slice_max(win_pct, n = 6, with_ties = FALSE) |>
  ungroup() |>
  arrange(style, desc(win_pct)) |>
  select(
    style, player, n_matches,
    win_pct, ace_rate, second_win_pct, bp_save_rate
  ) |>
  gt() |>
  tab_header(
    title = "Top players by playstyle",
    subtitle = "Highest win % within each cluster (min 50 matches · ATP 1996–2025)"
  ) |>
  fmt_percent(
    columns = c(win_pct, ace_rate, second_win_pct, bp_save_rate),
    decimals = 1
  ) |>
  fmt_number(columns = n_matches, decimals = 0) |>
  cols_label(
    style          = "Playstyle",
    player         = "Player",
    n_matches      = "Matches",
    win_pct        = "Win %",
    ace_rate       = "Ace Rate",
    second_win_pct = "2nd Win%",
    bp_save_rate   = "BP Save%"
  ) |>
  # The order of these calls is kept as-is because it determines the order
  # in which the groups are registered with the table.
  tab_row_group(
    label = "Big Server",
    rows = style == "Big Server"
  ) |>
  tab_row_group(
    label = "Aggressive Baseliner",
    rows = style == "Aggressive Baseliner"
  ) |>
  tab_row_group(
    label = "Counter-Puncher",
    rows = style == "Counter-Puncher"
  ) |>
  tab_row_group(
    label = "All-Court",
    rows = style == "All-Court"
  ) |>
  tab_style(
    style = cell_fill(color = "#F4F4F4"),
    locations = cells_row_groups()
  ) |>
  data_color(
    columns = win_pct,
    palette = c("#FDECEA", "#A93226")
  )
```