import os
import kagglehub
import numpy as np
import pandas as pd
import torch
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Download data from Kaggle
path = kagglehub.dataset_download("yashdevladdha/uber-ride-analytics-dashboard")
# Load data into Pandas DataFrame
csv_file = os.path.join(path, "ncr_ride_bookings.csv")
df = pd.read_csv(csv_file)
print("✅ Data loaded successfully!")✅ Data loaded successfully!
Target variable: Avg CTAT
Features: Avg VTAT, Booking Value, Ride Distance
df = (
    df.loc[:, ["Avg CTAT", "Avg VTAT", "Booking Value", "Ride Distance"]]
    .dropna()
    .reset_index(drop=True)
)
Create a NumPy array for the target with standard scaling.
y = df["Avg CTAT"].values
y_mean = np.mean(y)
y_std = np.std(y)
y_scaled = (y - y_mean) / y_std
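Since the target is standardized, any model predictions will be on the z-score scale. A minimal round-trip sketch (using only the y_mean and y_std computed above) showing how values map back to the original Avg CTAT units:

# Invert the standardization: z-scores back to the original Avg CTAT scale
y_back = y_scaled * y_std + y_mean
assert np.allclose(y_back, y)  # round trip recovers the raw target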
Create a NumPy array for the feature matrix with standard scaling.
scaler = StandardScaler()
X = df[["Avg VTAT", "Booking Value", "Ride Distance"]].values
X_scaled = scaler.fit_transform(X)

# Add a bias term
X_scaled = np.column_stack([np.ones(X_scaled.shape[0]), X_scaled])
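As a quick sanity check on the design matrix (a minimal sketch, no assumptions beyond the arrays built above): the first column should be all ones, and the feature columns should have mean ≈ 0 and standard deviation ≈ 1:

# Bias column is constant; remaining columns are standardized features
assert np.allclose(X_scaled[:, 0], 1.0)
print("Feature means:", X_scaled[:, 1:].mean(axis=0).round(6))  # ≈ 0
print("Feature stds: ", X_scaled[:, 1:].std(axis=0).round(6))   # ≈ 1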
Make a random 80/20% train/test split of X_scaled and y_scaled.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_scaled, test_size=0.2, random_state=42
)
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Training set shape: X={X_train.shape}, y={y_train.shape}")
print(f"Test set shape: X={X_test.shape}, y={y_test.shape}")Training set size: 81600
Test set size: 20400
Training set shape: X=(81600, 4), y=(81600,)
Test set shape: X=(20400, 4), y=(20400,)
What is the condition number of the training design matrix, and how much multicollinearity is there among the predictors?
condition_number = np.linalg.cond(X_train)
print(f"Condition number of training design matrix: {condition_number:.2f}")
print("\nInterpretation:")
if condition_number < 10:
    print("- Excellent: Matrix is well-conditioned")
elif condition_number < 100:
    print("- Good: Matrix is reasonably well-conditioned")
elif condition_number < 1000:
    print("- Fair: Some numerical instability may occur")
else:
    print("- Poor: Matrix is ill-conditioned, results may be unreliable")
Condition number of training design matrix: 1.07
Interpretation:
- Excellent: Matrix is well-conditioned
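The condition number summarizes the design matrix as a whole; to inspect multicollinearity per predictor, here is a minimal sketch using pairwise correlations and variance inflation factors (VIFs), computed directly in NumPy from the columns built above:

# Pairwise correlations between the standardized predictors (bias column excluded)
feature_names = ["Avg VTAT", "Booking Value", "Ride Distance"]
corr = np.corrcoef(X_train[:, 1:], rowvar=False)
print(pd.DataFrame(corr, index=feature_names, columns=feature_names).round(3))

# For standardized predictors, VIF_j is the j-th diagonal entry of the
# inverse correlation matrix
vif = np.diag(np.linalg.inv(corr))
for name, v in zip(feature_names, vif):
    print(f"VIF {name}: {v:.2f}")  # values near 1 mean little multicollinearity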
Implement the manual analytical (closed-form) solution for the parameters.
def closed_form_ols(X, y):
    """
    Compute OLS coefficients using the closed-form solution.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Design matrix including bias term
    y : ndarray, shape (n_samples,)
        Target vector

    Returns
    -------
    beta : ndarray, shape (n_features,)
        Estimated coefficients
    """
    XtX = X.T @ X
    Xty = X.T @ y
    # np.linalg.solve(XtX, Xty) is numerically preferable to an explicit inverse,
    # but with a condition number of ~1.07 the inverse is safe here
    beta = np.linalg.inv(XtX) @ Xty
    return beta
# Calculate coefficients
beta_closed = closed_form_ols(X_train, y_train)
print("Closed-form OLS coefficients:")
print(f" β₀ (bias): {beta_closed[0]:.6f}")
print(f" β₁ (Avg VTAT): {beta_closed[1]:.6f}")
print(f" β₂ (Booking Value): {beta_closed[2]:.6f}")
print(f" β₃ (Ride Distance): {beta_closed[3]:.6f}")Closed-form OLS coefficients:
β₀ (bias): 0.001998
β₁ (Avg VTAT): 0.055260
β₂ (Booking Value): 0.000572
β₃ (Ride Distance): 0.099276
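As a cross-check (a minimal sketch, NumPy only), np.linalg.lstsq solves the same least-squares problem without forming an explicit inverse and should agree with the coefficients above:

# Numerically stable least-squares solve on the same training data
beta_lstsq, *_ = np.linalg.lstsq(X_train, y_train, rcond=None)
print("Max |difference| vs closed form:", np.max(np.abs(beta_lstsq - beta_closed)))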
Compute the R² score on the test data.
from sklearn.metrics import r2_score
y_pred_test = X_test @ beta_closed
r2_closed = r2_score(y_test, y_pred_test)
print(f"\nClosed-form OLS R² on test set: {r2_closed:.4f}")
Closed-form OLS R² on test set: 0.0126
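The same fit can be reproduced with scikit-learn as a sanity check (a minimal sketch; fit_intercept=False because the bias column is already part of the design matrix):

from sklearn.linear_model import LinearRegression

# The design matrix already contains a column of ones, so disable sklearn's own intercept
lr = LinearRegression(fit_intercept=False).fit(X_train, y_train)
print("sklearn coefficients:", lr.coef_.round(6))
print(f"sklearn R² on test set: {lr.score(X_test, y_test):.4f}")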
Implement the manual gradient descent solution for the parameters.
b = torch.tensor([0.0, 0.0, 0.0, 0.0], requires_grad=True, dtype=torch.float32)
X_tensor = torch.tensor(X_train, dtype=torch.float32)
y_tensor = torch.tensor(y_train, dtype=torch.float32)
# Training loop
n_iterations = 1000
learning_rate = 0.01
loss_history = []
for i in range(n_iterations):
    # Forward pass: compute predictions
    y_pred = X_tensor @ b

    # Compute loss (Mean Squared Error)
    loss = torch.mean((y_tensor - y_pred) ** 2)

    # Backward pass: compute gradients (autograd!)
    loss.backward()  # Compute gradients via backpropagation

    # Update parameters
    with torch.no_grad():  # Disable gradient tracking for parameter update
        b -= learning_rate * b.grad

    # Zero gradients for next iteration (crucial!)
    b.grad.zero_()

    # Store loss for visualization
    loss_history.append(loss.item())

    if (i + 1) % 100 == 0 or i == 0:
        print(f"Iteration {i + 1}/{n_iterations}, Loss: {loss.item():.6f}")

# Extract learned parameters
print(f"\nFinal parameters: {b}")
print("\nComparison with closed-form OLS:")
print(f" β₀: GD={b[0].item():.6f}, OLS={beta_closed[0]:.6f}")
print(f" β₁: GD={b[1].item():.6f}, OLS={beta_closed[1]:.6f}")
print(f" β₂: GD={b[2].item():.6f}, OLS={beta_closed[2]:.6f}")
print(f" β₃: GD={b[3].item():.6f}, OLS={beta_closed[3]:.6f}")
Iteration 1/1000, Loss: 0.999292
Iteration 100/1000, Loss: 0.985887
Iteration 200/1000, Loss: 0.985688
Iteration 300/1000, Loss: 0.985685
Iteration 400/1000, Loss: 0.985685
Iteration 500/1000, Loss: 0.985685
Iteration 600/1000, Loss: 0.985685
Iteration 700/1000, Loss: 0.985685
Iteration 800/1000, Loss: 0.985685
Iteration 900/1000, Loss: 0.985685
Iteration 1000/1000, Loss: 0.985685
Final parameters: tensor([0.0020, 0.0553, 0.0006, 0.0993], requires_grad=True)
Comparison with closed-form OLS:
β₀: GD=0.001998, OLS=0.001998
β₁: GD=0.055260, OLS=0.055260
β₂: GD=0.000572, OLS=0.000572
β₃: GD=0.099276, OLS=0.099276
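The manual update above is the same step that torch.optim.SGD performs; here is a minimal sketch of the equivalent loop using the optimizer API (same tensors and hyperparameters as above):

# Same gradient descent, expressed with torch.optim
b2 = torch.zeros(4, requires_grad=True)
optimizer = torch.optim.SGD([b2], lr=learning_rate)
for _ in range(n_iterations):
    optimizer.zero_grad()                               # reset gradients
    loss = torch.mean((y_tensor - X_tensor @ b2) ** 2)  # MSE loss
    loss.backward()                                     # backpropagate
    optimizer.step()                                    # b2 -= lr * b2.grad
print("Optimizer-based parameters:", b2.detach().numpy().round(6))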
Implement the manual maximum likelihood estimate for the normally distributed noise parameter.
residuals = y_train - X_train @ beta_closed
N = len(y_train)
M = 4  # number of parameters (intercept + 3 slopes)

# Note: dividing by N - M gives the unbiased (degrees-of-freedom-corrected)
# estimator; the pure MLE divides by N, but with N = 81600 the difference
# is negligible
sigma_hat = np.sqrt(np.sum(residuals**2) / (N - M))
print(f"\nEstimated standard deviation of residuals: {sigma_hat:.6f}")
Estimated standard deviation of residuals: 0.992841
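For comparison, the strict MLE divides by N rather than N - M (a minimal sketch on the same residuals); with N = 81600 the two estimates are practically identical:

# Strict maximum likelihood estimate of sigma
sigma_mle = np.sqrt(np.sum(residuals**2) / N)
print(f"MLE sigma: {sigma_mle:.6f} (vs. corrected: {sigma_hat:.6f})")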
Compute the 95% confidence intervals for β₀, β₁, β₂, and β₃.
# Calculate standard errors
# SE(b_j) = σ * sqrt([(X^T X)^{-1}]_{jj})
XtX_inv = np.linalg.inv(X_train.T @ X_train)
se_b = sigma_hat * np.sqrt(np.diag(XtX_inv))
# 95% confidence intervals using t-distribution
alpha = 0.05
t_critical = stats.t.ppf(1 - alpha / 2, df=N - M)
ci_lower = beta_closed - t_critical * se_b
ci_upper = beta_closed + t_critical * se_b
print("\n95% Confidence Intervals:")
print(f" b0: [{ci_lower[0]:.3f}, {ci_upper[0]:.3f}]")
print(f" b1: [{ci_lower[1]:.3f}, {ci_upper[1]:.3f}]")
print(f" b2: [{ci_lower[2]:.3f}, {ci_upper[2]:.3f}]")
print(f" b3: [{ci_lower[3]:.3f}, {ci_upper[3]:.3f}]")
95% Confidence Intervals:
b0: [-0.005, 0.009]
b1: [0.048, 0.062]
b2: [-0.006, 0.007]
b3: [0.092, 0.106]
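These intervals can be cross-checked with statsmodels (a minimal sketch, assuming statsmodels is installed; sm.OLS expects the constant column to already be in the design matrix, which ours is):

import statsmodels.api as sm

# OLS fit on the same design matrix (bias column already included)
model = sm.OLS(y_train, X_train).fit()
print(model.conf_int(alpha=0.05))  # rows correspond to b0..b3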