Skip to article frontmatterSkip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

import os

import kagglehub
import numpy as np
import pandas as pd
import torch
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Fetch the Uber ride analytics dataset from Kaggle (kagglehub caches it locally).
path = kagglehub.dataset_download("yashdevladdha/uber-ride-analytics-dashboard")

# Read the ride-bookings CSV into a pandas DataFrame.
csv_file = os.path.join(path, "ncr_ride_bookings.csv")
df = pd.read_csv(csv_file)
print("✅ Data loaded successfully!")
✅ Data loaded successfully!
  • Target variabele $\pmb{y}$:

    • Avg CTAT

  • Features $\pmb{X}$:

    • Avg VTAT

    • Booking Value

    • Ride Distance

# Restrict the DataFrame to the regression target and predictors,
# drop rows with missing values, and renumber the index from zero.
selected_columns = ["Avg CTAT", "Avg VTAT", "Booking Value", "Ride Distance"]
df = df.loc[:, selected_columns].dropna().reset_index(drop=True)

✍️

Maak een NumPy array aan voor de target $\pmb{y}$ mét standaardschaling.

# Target vector: Avg CTAT, manually standardized to zero mean / unit variance.
y = df["Avg CTAT"].to_numpy()

y_mean = y.mean()
y_std = y.std()
y_scaled = (y - y_mean) / y_std

✍️

Maak een NumPy array aan voor de featurematrix $\pmb{X}$ mét standaardschaling.

# Standardize the three predictors, then prepend a column of ones so the
# first coefficient plays the role of the intercept (bias) term.
scaler = StandardScaler()
X = df[["Avg VTAT", "Booking Value", "Ride Distance"]].to_numpy()
X_scaled = scaler.fit_transform(X)
ones = np.ones((X_scaled.shape[0], 1))
X_scaled = np.hstack([ones, X_scaled])

✍️

Maak een random 80/20% train/test split van $\pmb{y}$ en $\pmb{X}$.

# Random 80/20 train/test split; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_scaled, test_size=0.2, random_state=42
)

splits = (("Training", X_train, y_train), ("Test", X_test, y_test))
for split_name, X_part, _ in splits:
    print(f"{split_name} set size: {len(X_part)}")
for split_name, X_part, y_part in splits:
    print(f"{split_name} set shape: X={X_part.shape}, y={y_part.shape}")
Training set size: 81600
Test set size: 20400
Training set shape: X=(81600, 4), y=(81600,)
Test set shape: X=(20400, 4), y=(20400,)

✍️

Hoe is het gesteld met het conditienummer van de training design matrix en de multicollineariteit bij de predictoren?

# Condition number = ratio of largest to smallest singular value of the
# design matrix; a large value would signal multicollinearity among the
# (already standardized) predictors.
condition_number = np.linalg.cond(X_train)
print(f"Condition number of training design matrix: {condition_number:.2f}")
print("\nInterpretation:")
interpretation_bands = (
    (10, "- Excellent: Matrix is well-conditioned"),
    (100, "- Good: Matrix is reasonably well-conditioned"),
    (1000, "- Fair: Some numerical instability may occur"),
)
for upper_bound, verdict in interpretation_bands:
    if condition_number < upper_bound:
        print(verdict)
        break
else:
    print("- Poor: Matrix is ill-conditioned, results may be unreliable")
Condition number of training design matrix: 1.07

Interpretation:
- Excellent: Matrix is well-conditioned

✍️

Implementeer de manuele analytische oplossing voor de parameters.

def closed_form_ols(X, y):
    """
    Compute OLS coefficients via the normal equations.

    Solves (XᵀX) β = Xᵀy with ``np.linalg.solve`` rather than explicitly
    forming the inverse of XᵀX — mathematically equivalent, but numerically
    more stable and cheaper than ``np.linalg.inv(XtX) @ Xty``.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Design matrix including bias term
    y : ndarray, shape (n_samples,)
        Target vector

    Returns
    -------
    beta : ndarray, shape (n_features,)
        Estimated coefficients
    """
    XtX = X.T @ X
    Xty = X.T @ y
    # Direct linear solve of the normal equations (LAPACK, LU-based).
    beta = np.linalg.solve(XtX, Xty)
    return beta


# Fit the model on the training split and report each coefficient.
beta_closed = closed_form_ols(X_train, y_train)

print("Closed-form OLS coefficients:")
coef_labels = (
    "  β₀ (bias):        ",
    "  β₁ (Avg VTAT):    ",
    "  β₂ (Booking Value):    ",
    "  β₃ (Ride Distance): ",
)
for label, coefficient in zip(coef_labels, beta_closed):
    print(f"{label}{coefficient:.6f}")
Closed-form OLS coefficients:
  β₀ (bias):        0.001998
  β₁ (Avg VTAT):    0.055260
  β₂ (Booking Value):    0.000572
  β₃ (Ride Distance): 0.099276

✍️

Bereken de $R^2$ score op de test data.

from sklearn.metrics import r2_score

# Predictions of the closed-form model on the held-out test split.
y_pred_test = X_test.dot(beta_closed)
# Coefficient of determination (R²) against the true test targets.
r2_closed = r2_score(y_test, y_pred_test)

print(f"\nClosed-form OLS R² on test set: {r2_closed:.4f}")

Closed-form OLS R² on test set: 0.0126

✍️

Implementeer de manuele gradient descent oplossing voor de parameters.

# Parameter vector (intercept + 3 slopes), tracked by autograd.
b = torch.tensor([0.0, 0.0, 0.0, 0.0], requires_grad=True, dtype=torch.float32)

# Training data as float32 tensors.
X_tensor = torch.tensor(X_train, dtype=torch.float32)

y_tensor = torch.tensor(y_train, dtype=torch.float32)

# Gradient-descent hyperparameters.
n_iterations = 1000
learning_rate = 0.01
loss_history = []

for i in range(n_iterations):
    # Forward pass: linear predictions.
    y_pred = X_tensor @ b

    # Compute loss (Mean Squared Error)
    loss = torch.mean((y_tensor - y_pred) ** 2)

    # Backward pass: autograd fills b.grad with dLoss/db.
    loss.backward()

    # Update parameters outside the autograd graph.
    with torch.no_grad():
        b -= learning_rate * b.grad

    # Zero gradients for next iteration (gradients accumulate otherwise).
    b.grad.zero_()

    # Store loss for visualization
    loss_history.append(loss.item())

    if (i + 1) % 100 == 0 or i == 0:
        print(f"Iteration {i + 1}/{n_iterations}, Loss: {loss.item():.6f}")

# Report learned parameters.
# BUG FIX: the original line used a plain string "\nFinal parameters: {b}",
# which printed the literal text "{b}" (visible in the captured output);
# an f-string is required to interpolate the tensor.
print(f"\nFinal parameters: {b}")
print("\nComparison with closed-form OLS:")
print(f"  β₀: GD={b[0].item():.6f}, OLS={beta_closed[0]:.6f}")
print(f"  β₁: GD={b[1].item():.6f}, OLS={beta_closed[1]:.6f}")
print(f"  β₂: GD={b[2].item():.6f}, OLS={beta_closed[2]:.6f}")
print(f"  β₃: GD={b[3].item():.6f}, OLS={beta_closed[3]:.6f}")
Iteration 1/1000, Loss: 0.999292
Iteration 100/1000, Loss: 0.985887
Iteration 200/1000, Loss: 0.985688
Iteration 300/1000, Loss: 0.985685
Iteration 400/1000, Loss: 0.985685
Iteration 500/1000, Loss: 0.985685
Iteration 600/1000, Loss: 0.985685
Iteration 700/1000, Loss: 0.985685
Iteration 800/1000, Loss: 0.985685
Iteration 900/1000, Loss: 0.985685
Iteration 1000/1000, Loss: 0.985685

Final parameters: {b}

Comparison with closed-form OLS:
  β₀: GD=0.001998, OLS=0.001998
  β₁: GD=0.055260, OLS=0.055260
  β₂: GD=0.000572, OLS=0.000572
  β₃: GD=0.099276, OLS=0.099276

✍️

Implementeer de manuele maximum likelihood schatting voor de normaal verdeelde ruis parameter.

# Residuals of the closed-form fit on the training data.
residuals = y_train - X_train.dot(beta_closed)

# Noise standard deviation estimated from the residual sum of squares,
# divided by (N - M) degrees of freedom (bias-corrected).
N = y_train.shape[0]
M = 4  # number of parameters (intercept + 3 slopes)
rss = np.sum(residuals**2)
sigma_hat = np.sqrt(rss / (N - M))
print(f"\nEstimated standard deviation of residuals: {sigma_hat:.6f}")

Estimated standard deviation of residuals: 0.992841

✍️

Bereken de 95% betrouwbaarheidsintervallen voor $b_0$, $b_1$, $b_2$ en $b_3$.

# Calculate standard errors
# SE(b_j) = σ * sqrt([(X^T X)^{-1}]_{jj})
# Standard errors of the OLS coefficients:
#   SE(b_j) = σ̂ * sqrt([(XᵀX)⁻¹]_{jj})
XtX_inv = np.linalg.inv(X_train.T @ X_train)
se_b = sigma_hat * np.sqrt(np.diag(XtX_inv))

# Two-sided 95% interval from the t-distribution with N - M dof.
alpha = 0.05
t_critical = stats.t.ppf(1 - alpha / 2, df=N - M)

ci_lower = beta_closed - t_critical * se_b
ci_upper = beta_closed + t_critical * se_b

print("\n95% Confidence Intervals:")
for j in range(4):
    print(f"  b{j}: [{ci_lower[j]:.3f}, {ci_upper[j]:.3f}]")

95% Confidence Intervals:
  b0: [-0.005, 0.009]
  b1: [0.048, 0.062]
  b2: [-0.006, 0.007]
  b3: [0.092, 0.106]