import os

import kagglehub
import numpy as np
import pandas as pd
from scipy import stats

# Download data from Kaggle (kagglehub caches the dataset locally after the
# first download).
path = kagglehub.dataset_download("yashdevladdha/uber-ride-analytics-dashboard")

# Load data into a pandas DataFrame.
csv_file = os.path.join(path, "ncr_ride_bookings.csv")
df = pd.read_csv(csv_file)
print("✅ Data loaded successfully!")
# Output: ✅ Data loaded successfully!
✍️ Opdracht
Implementeer de (empirische) PMF van de variabele Payment Method.
# Empirical PMF of Payment Method: relative frequency of each category.
pmf_dict = df["Payment Method"].value_counts(normalize=True).to_dict()
print(pmf_dict)


def pmf(payment_method):
    """Return the empirical probability of *payment_method* (0.0 if unseen).

    The 0.0 default keeps arithmetic such as ``pmf(a) + pmf(b)`` valid for
    categories absent from the data; a bare ``dict.get`` would return None
    and raise a TypeError on addition.
    """
    return pmf_dict.get(payment_method, 0.0)
# Output: {'UPI': 0.4501, 'Cash': 0.2487, 'Uber Wallet': 0.1204,
#          'Credit Card': 0.1001, 'Debit Card': 0.0808}
✍️ Opdracht
Bereken de kans P(Payment Method = Cash) + P(Payment Method = Card), waarbij "Card" zowel Credit Card als Debit Card omvat.
# P(Cash) + P(Credit Card) + P(Debit Card): the categories are disjoint,
# so the probabilities add.
p_cash_or_card = pmf("Cash") + pmf("Credit Card") + pmf("Debit Card")
print(p_cash_or_card)  # Output: 0.42955882352941177
Gegeven een uniforme verdeling op het interval [-3.1, 5.5], bereken manueel de kans dat -0.1 ≤ X ≤ 0.6.
# Uniform distribution on [a, b]: the probability of a sub-interval is its
# length divided by the total length of the support.
a, b = -3.1, 5.5
interval_start, interval_end = -0.1, 0.6
p_interval_manual = (interval_end - interval_start) / (b - a)
print(p_interval_manual)  # Output: 0.0813953488372093
# Probabilities under X ~ N(mu = 0.6, sigma = 2.2), computed via the CDF.
normal = stats.norm(loc=0.6, scale=2.2)

# 1. P(X ≤ 0): direct CDF evaluation.
p1 = normal.cdf(0)
print(f"1. P(x ≤ 0) = {p1:.4f}")

# 2. P(-1 ≤ X ≤ 1): difference of two CDF values.
p2 = normal.cdf(1) - normal.cdf(-1)
print(f"2. P(-1 ≤ x ≤ 1) = {p2:.4f}")

# 3. P(X > 2): complement rule.
p3 = 1 - normal.cdf(2)
print(f"3. P(x > 2) = {p3:.4f}")
# Output:
# 1. P(x ≤ 0) = 0.3925
# 2. P(-1 ≤ x ≤ 1) = 0.3386
# 3. P(x > 2) = 0.2623
✍️ Opdracht
Construeer de joint PMF voor Payment Method en Vehicle Type.
# Joint (empirical) PMF over (Payment Method, Vehicle Type) pairs.
pmf_dict = df[["Payment Method", "Vehicle Type"]].value_counts(normalize=True).to_dict()
print(pmf_dict)


def pmf(payment_method, vehicle_type):
    """Return the empirical joint probability of the pair (0.0 if unseen).

    The 0.0 default keeps sums over combinations valid even when a pair
    never occurs in the data (``dict.get`` would otherwise return None).
    """
    return pmf_dict.get((payment_method, vehicle_type), 0.0)
# Output: {('UPI', 'Auto'): 0.1111, ('UPI', 'Go Mini'): 0.0907,
#          ('UPI', 'Go Sedan'): 0.0799, ... 35 pairs in total ...}

# P(Debit Card AND Bike): a single cell of the joint PMF.
p1 = pmf("Debit Card", "Bike")
print(f"P(Debit Card, Bike) = {p1:.4f}")

# P(Cash AND (Uber XL OR Premier Sedan)): disjoint events, so add.
p2 = pmf("Cash", "Uber XL") + pmf("Cash", "Premier Sedan")
print(f"P(Cash, Uber XL or Premier Sedan) = {p2:.4f}")

# Marginal P(eBike): sum the joint PMF over all payment methods.
p3 = sum(pmf(method, "eBike") for method in df["Payment Method"].dropna().unique())
print(f"P(eBike) = {p3:.4f}")
# Output:
# P(Debit Card, Bike) = 0.0120
# P(Cash, Uber XL or Premier Sedan) = 0.0370
# P(eBike) = 0.0704
✍️ Opdracht
Vergelijk de marginale kans P(Payment Method = m) met de gezamenlijke kans P(Payment Method = m, Vehicle Type ∈ {eBike, Uber XL}).
# Marginal P(Payment Method = m): sum the joint PMF over all vehicle types.
marginal_ps = {
    method: sum(pmf(method, vehicle_type) for vehicle_type in df["Vehicle Type"].dropna().unique())
    for method in df["Payment Method"].dropna().unique()
}
# NOTE(review): despite the name, this is the JOINT probability
# P(method, vehicle ∈ {eBike, Uber XL}) — it is not divided by
# P(vehicle ∈ {eBike, Uber XL}), so it is not a conditional probability.
# Confirm against the exercise statement whether this is intended.
conditional_ps = {
    method: sum(pmf(method, vehicle_type) for vehicle_type in ["eBike", "Uber XL"])
    for method in df["Payment Method"].dropna().unique()
}
for method, p_marginal in marginal_ps.items():
    print(f"{method}:")
    print(f"\tmarginal: {p_marginal:.4f}")
    # Both dicts are built from the same key set, so the lookup always hits.
    p_conditional = conditional_ps.get(method)
    print(f"\tconditional: {p_conditional:.4f}")
# Output:
# UPI:          marginal: 0.4501   conditional: 0.0453
# Debit Card:   marginal: 0.0808   conditional: 0.0082
# Cash:         marginal: 0.2487   conditional: 0.0244
# Uber Wallet:  marginal: 0.1204   conditional: 0.0127
# Credit Card:  marginal: 0.1001   conditional: 0.0096
✍️ Opdracht
Stel dat we 30 nieuwe observaties zouden maken, wat is de verwachte hoeveelheid Uber XLs en eBikes op basis van de empirische PMF van Vehicle Type?
# Expected count of each vehicle type in n = 30 new observations: n * P(type).
pmf_dict = df["Vehicle Type"].value_counts(normalize=True).to_dict()
n = 30
# .get(..., 0.0) guards against a category absent from the data, which would
# otherwise make round(None * n) raise a TypeError.
n_xl = int(np.round(pmf_dict.get("Uber XL", 0.0) * n))
n_eb = int(np.round(pmf_dict.get("eBike", 0.0) * n))
print(f"# Uber XL: {n_xl}")
print(f"# eBike: {n_eb}")
# Output:
# # Uber XL: 1
# # eBike: 2
✍️ Opdracht
Bereken de eenzijdige p-waarde onder de nulhypothese H₀: X ~ N(μ = 500, σ = 25) voor volgende gewichten (in gram): 450, 530, 505, 600.
# One-sided p-values under H0: X ~ N(mu = 500, sigma = 25).
# For an observation below the mean we report the left tail P(X ≤ x);
# for one at or above the mean, the right tail P(X ≥ x).
mu = 500
sigma = 25
for x in [450, 530, 505, 600]:
    p_right = 1 - stats.norm.cdf(x, mu, sigma)
    p_left = stats.norm.cdf(x, mu, sigma)
    if x < mu:
        print(f"x = {x:3d}: P(X≤x) = {p_left:.4f}")
    else:
        print(f"x = {x:3d}: P(X≥x) = {p_right:.4f}")
# Output:
# x = 450: P(X≤x) = 0.0228
# x = 530: P(X≥x) = 0.1151
# x = 505: P(X≥x) = 0.4207
# x = 600: P(X≥x) = 0.0000
✍️ Opdracht
Een smartphone fabrikant adverteert dat de batterij gemiddeld 24 uur meegaat. Een test van 22 toestellen geeft de volgende batterijduren (in uren):
[23.2, 25.1, 22.8, 24.5, 23.7, 25.3, 22.5, 26.1, 24.0, 23.5, 24.8, 23.1, 25.0, 24.2, 23.9, 22.7, 25.5, 24.1, 23.6, 24.7, 23.4, 24.9]
Test of de gemiddelde batterijduur significant afwijkt van 24 uur (α = 0.05). Doe dit via manuele berekening van de teststatistiek.
# One-sample two-sided t-test, computed manually.
# H0: mean battery life = 24 hours; H1: mean battery life ≠ 24 hours.
batt = np.array([
    23.2, 25.1, 22.8, 24.5, 23.7, 25.3, 22.5, 26.1, 24.0, 23.5, 24.8,
    23.1, 25.0, 24.2, 23.9, 22.7, 25.5, 24.1, 23.6, 24.7, 23.4, 24.9,
])
mu_0 = 24  # Hypothesized mean

# Sample statistics
sample_mean = batt.mean()
sample_std = batt.std(ddof=1)  # ddof=1 gives the unbiased sample std (N-1)
n = len(batt)
se = sample_std / np.sqrt(n)  # Standard error of the mean

# t-statistic: distance of the sample mean from mu_0 in standard errors.
t_statistic_manual = (sample_mean - mu_0) / se

# Degrees of freedom. Renamed from `df` to `dof`: the original assignment
# shadowed (and destroyed) the DataFrame `df` loaded at the top of the script.
dof = n - 1

# Two-tailed p-value: twice the upper-tail probability of |t|.
p_value_manual = 2 * (1 - stats.t.cdf(abs(t_statistic_manual), dof))

print("Manual")
print(f"t-statistic: {t_statistic_manual:.4f}")
print(f"p-value (two-tailed): {p_value_manual:.4f}")
# Output:
# Manual
# t-statistic: 0.5653
# p-value (two-tailed): 0.5779
✍️ Opdracht
Een online leerplatform test een nieuwe onderwijsmethode. Ze vergelijken de eindscores (op 100) van twee groepen studenten:
Traditionele methode:
[72, 68, 75, 71, 69, 74, 70, 73, 68, 71, 72, 70, 74, 69, 71]
Nieuwe methode:
[78, 82, 76, 80, 79, 81, 77, 83, 79, 80, 78, 81, 82, 79, 80]
Test of de nieuwe methode tot significant hogere scores leidt (eenzijdige test, α = 0.05) - in de veronderstelling van een gezamenlijke sample variantie. Doe dit via manuele berekening van de teststatistiek.
# Two-sample pooled-variance t-test, computed manually.
# H0: mu_new = mu_traditional; H1: mu_new > mu_traditional (one-sided).
control = np.array([72, 68, 75, 71, 69, 74, 70, 73, 68, 71, 72, 70, 74, 69, 71])
treatment = np.array([78, 82, 76, 80, 79, 81, 77, 83, 79, 80, 78, 81, 82, 79, 80])

# Sample statistics per group
n1 = len(control)
n2 = len(treatment)
mean1 = control.mean()
mean2 = treatment.mean()
std1 = control.std(ddof=1)
std2 = treatment.std(ddof=1)

# Pooled standard deviation (equal-variance assumption).
pooled_std = np.sqrt(((n1 - 1) * std1**2 + (n2 - 1) * std2**2) / (n1 + n2 - 2))

# Standard error of the difference in means.
se_diff = pooled_std * np.sqrt(1 / n1 + 1 / n2)

# t-statistic for (control - treatment): negative when the new method
# scores higher.
t_stat_manual = (mean1 - mean2) / se_diff

# Degrees of freedom
df_manual = n1 + n2 - 2

# BUG FIX: the alternative "new method scores higher" corresponds, with
# t = (mean1 - mean2)/se, to the LEFT tail P(T ≤ t). The original code took
# the right tail 1 - cdf(t) — i.e. it tested "control > treatment" — and so
# reported p ≈ 1.0000 instead of the correct p ≈ 5e-12.
p_value_manual = stats.t.cdf(t_stat_manual, df_manual)

print("Manual")
print(f"t-statistic: {t_stat_manual:.4f}")
print(f"p-value: {p_value_manual:.4f}")