Code
set.seed(123) # For reproducibility
n <- 500 # Number of matches
# Simulate team skill levels (values between 50 and 100)
teamA_skill <- round(runif(n, 50, 100), 1)
teamB_skill <- round(runif(n, 50, 100), 1)
# Simulate home advantage (0 or 1) and match importance (categorical)
home_advantage <- rbinom(n, 1, 0.5)
match_importance <- factor(sample(c("low", "medium", "high"), n, replace = TRUE))
# Map match importance to a numeric effect
importance_effect <- ifelse(match_importance == "low", 0,
                            ifelse(match_importance == "medium", 0.3, 0.6))
# Linear predictor for Team A's log-odds of winning: skill gap, home advantage, and match importance
lin_pred <- (teamA_skill - teamB_skill) / 10 + 0.5 * home_advantage + importance_effect
prob <- 1 / (1 + exp(-lin_pred)) # Logistic transformation
# Generate binary outcome (1 = Team A wins, 0 = Team A loses)
result <- rbinom(n, 1, prob)
# Create the data frame
sports_data <- data.frame(
  teamA_skill,
  teamB_skill,
  home_advantage = factor(home_advantage),
  match_importance,
  result = factor(result)
)
set.seed(456)
train_indices <- sample(1:n, size = round(0.7 * n))
train_data <- sports_data[train_indices, ]
test_data <- sports_data[-train_indices, ]
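# (Illustrative addition, not in the original script:) quick sanity checks
# on the simulated data and the 70/30 split
table(sports_data$result)          # Team A's loss/win counts across all matches
nrow(train_data); nrow(test_data)  # expect 350 training and 150 test rows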
## Generalised Linear Model (GLM)
glm_model <- glm(result ~ teamA_skill + teamB_skill + home_advantage + match_importance,
                 data = train_data, family = binomial)
summary(glm_model)
Call:
glm(formula = result ~ teamA_skill + teamB_skill + home_advantage +
    match_importance, family = binomial, data = train_data)

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)
(Intercept)              1.16348    0.97997   1.187  0.23512
teamA_skill              0.10117    0.01297   7.802 6.07e-15 ***
teamB_skill             -0.10925    0.01293  -8.450  < 2e-16 ***
home_advantage1          0.77300    0.29104   2.656  0.00791 **
match_importancelow     -0.51460    0.35557  -1.447  0.14783
match_importancemedium  -0.82500    0.35909  -2.297  0.02159 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 479.66  on 349  degrees of freedom
Residual deviance: 301.69  on 344  degrees of freedom
AIC: 313.69

Number of Fisher Scoring iterations: 5
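Note that factor() orders levels alphabetically, so high is the reference category for match_importance; the negative low and medium coefficients are contrasts against high-importance matches, consistent with the simulation assigning the largest boost (0.6) to high. The accuracy computation below is folded on the rendered page; what follows is a minimal reconstruction, assuming a 0.5 probability threshold (the glm_probs, glm_class, and glm_accuracy names are illustrative).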
Code
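# Predict Team A win probabilities on the test set and threshold at 0.5
glm_probs <- predict(glm_model, newdata = test_data, type = "response")
glm_class <- ifelse(glm_probs > 0.5, 1, 0)
glm_accuracy <- mean(glm_class == as.numeric(as.character(test_data$result)))
cat("GLM Accuracy:", glm_accuracy, "\n")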
GLM Accuracy: 0.7866667
Code
## Regularised Linear Model (Lasso)
library(glmnet)
# Create model matrices with dummy variables for factors
x_train <- model.matrix(result ~ teamA_skill + teamB_skill + home_advantage + match_importance, train_data)[,-1]
y_train <- as.numeric(as.character(train_data$result))
x_test <- model.matrix(result ~ teamA_skill + teamB_skill + home_advantage + match_importance, test_data)[,-1]
# Cross-validation to select lambda (regularisation strength)
cv_glmnet <- cv.glmnet(x_train, y_train, family = "binomial", alpha = 1)
glmnet_preds <- predict(cv_glmnet, newx = x_test, type = "response", s = "lambda.min")
glmnet_class <- ifelse(glmnet_preds > 0.5, 1, 0)
glmnet_accuracy <- mean(glmnet_class == as.numeric(as.character(test_data$result)))
cat("Regularised GLM (Lasso) Accuracy:", glmnet_accuracy, "\n")
Regularised GLM (Lasso) Accuracy: 0.78
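To see which predictors survive the penalty at the selected lambda, the fitted cross-validation object can be inspected (an illustrative extra step, not in the original):

coef(cv_glmnet, s = "lambda.min")  # sparse coefficient vector; exact zeros are dropped terms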
Code
## Random Forest
library(randomForest)
rf_model <- randomForest(result ~ teamA_skill + teamB_skill + home_advantage + match_importance,
                         data = train_data, ntree = 100)
rf_preds <- predict(rf_model, test_data)
rf_accuracy <- mean(rf_preds == test_data$result)
cat("Random Forest Accuracy:", rf_accuracy, "\n")
Random Forest Accuracy: 0.8
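Random forests also provide variable importance measures, worth a glance alongside the accuracy (an illustrative extra step, not in the original):

importance(rf_model)  # mean decrease in Gini for each predictor
varImpPlot(rf_model)  # the same information as a dot plot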
Code
## Support Vector Machines (SVM)
library(e1071)
svm_model <- svm(result ~ teamA_skill + teamB_skill + home_advantage + match_importance,
                 data = train_data, probability = TRUE)
svm_preds <- predict(svm_model, test_data, probability = TRUE)
svm_accuracy <- mean(svm_preds == test_data$result)
cat("SVM Accuracy:", svm_accuracy, "\n")
SVM Accuracy: 0.7533333
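Because the SVM was fitted with probability = TRUE, per-class probability estimates are attached to the prediction object and can be recovered (an illustrative extra step, not in the original):

svm_probs <- attr(svm_preds, "probabilities")  # matrix with one column per class
head(svm_probs)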