Seminar 11

1. Coefficient of Determination ($R^2$)

Definition¶

The coefficient of determination measures the proportion of variability in the response variable $Y$ explained by the regression model.

We start from the decomposition:

$$ Y_i = \hat{Y}_i + e_i $$

where:

  • $\hat{Y}_i = \hat{\beta}_0 + \hat{\beta}_1 x_i$
  • $e_i = Y_i - \hat{Y}_i$

Total Variability Decomposition¶

$$ \underbrace{\sum_{i=1}^n (Y_i - \bar{Y})^2}_{SST} = \underbrace{\sum_{i=1}^n (\hat{Y}_i - \bar{Y})^2}_{SSR} + \underbrace{\sum_{i=1}^n (Y_i - \hat{Y}_i)^2}_{SSE} $$
  • $SST$: Total Sum of Squares
  • $SSR$: Regression Sum of Squares
  • $SSE$: Error Sum of Squares

Definition of $R^2$¶

$$ R^2 = \frac{SSR}{SST} = 1 - \frac{SSE}{SST} $$

Alternative Expression¶

$$ R^2 = \frac{\sum (\hat{Y}_i - \bar{Y})^2}{\sum (Y_i - \bar{Y})^2} $$

Interpretation¶

  • $R^2 = 1$: perfect fit
  • $R^2 = 0$: no explanatory power

In simple regression: $$ R^2 = r^2 $$

where $r$ is the Pearson correlation coefficient.
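A quick numerical sanity check of this identity (a minimal sketch with simulated data, not one of the seminar examples):

In [ ]:
# Quick check (simulated data) that R^2 = r^2 in simple linear regression.
import numpy as np

rng = np.random.default_rng(0)
x = np.arange(1.0, 11.0)
y = 2.0 + 1.5 * x + rng.normal(0, 1, size=x.size)

r = np.corrcoef(x, y)[0, 1]  # Pearson correlation coefficient

# OLS fit and R^2 via the sums of squares
b1 = np.sum((x - x.mean()) * (y - y.mean())) / np.sum((x - x.mean())**2)
b0 = y.mean() - b1 * x.mean()
y_hat = b0 + b1 * x
R2 = 1 - np.sum((y - y_hat)**2) / np.sum((y - y.mean())**2)

print(f"r^2 = {r**2:.6f}")
print(f"R^2 = {R2:.6f}")  # agrees with r^2 up to floating-point error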


In [2]:
import numpy as np
import matplotlib.pyplot as plt

def regression_r2_analysis(x, y, title="Regression Example"):
    x = np.array(x, dtype=float)
    y = np.array(y, dtype=float)
    
    n = len(x)
    x_bar = np.mean(x)
    y_bar = np.mean(y)
    
    # OLS estimators
    Sxx = np.sum((x - x_bar)**2)
    Sxy = np.sum((x - x_bar)*(y - y_bar))
    
    b1 = Sxy / Sxx
    b0 = y_bar - b1 * x_bar
    
    # fitted values
    y_hat = b0 + b1 * x
    
    # sums of squares
    SST = np.sum((y - y_bar)**2)
    SSE = np.sum((y - y_hat)**2)
    SSR = np.sum((y_hat - y_bar)**2)
    
    R2 = SSR / SST
    
    # print results
    print(title)
    print("-" * len(title))
    print(f"Intercept b0 = {b0:.4f}")
    print(f"Slope b1     = {b1:.4f}")
    print(f"SST          = {SST:.4f}")
    print(f"SSE          = {SSE:.4f}")
    print(f"SSR          = {SSR:.4f}")
    print(f"R^2          = {R2:.4f}")
    print(f"Check: 1 - SSE/SST = {1 - SSE/SST:.4f}")
    
    # plot
    x_line = np.linspace(np.min(x), np.max(x), 200)
    y_line = b0 + b1 * x_line
    
    plt.figure(figsize=(7,5))
    plt.scatter(x, y, label="Data")
    plt.plot(x_line, y_line, label="Least Squares Line")
    plt.xlabel("x")
    plt.ylabel("y")
    plt.title(f"{title}\n$R^2 = {R2:.4f}$")
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
    
    return {
        "b0": b0,
        "b1": b1,
        "SST": SST,
        "SSE": SSE,
        "SSR": SSR,
        "R2": R2
    }

Example 1: Strong Linear Relationship¶

In this first example, the points lie very close to a straight line. Therefore, the residual sum of squares $SSE$ should be small, and so the coefficient of determination $R^2$ should be close to $1$.

This means that the regression line explains most of the variability in the response variable.

In [3]:
x1 = [1, 2, 3, 4, 5, 6, 7, 8]
y1 = [2.1, 4.2, 6.1, 8.3, 9.9, 12.2, 13.8, 16.1]

res1 = regression_r2_analysis(x1, y1, title="Example 1: Strong Linear Relationship")
Example 1: Strong Linear Relationship
-------------------------------------
Intercept b0 = 0.2000
Slope b1     = 1.9750
SST          = 163.9888
SSE          = 0.1625
SSR          = 163.8263
R^2          = 0.9990
Check: 1 - SSE/SST = 0.9990

Example 2: Weak Linear Relationship¶

In the second example, the data do not follow a clear straight-line pattern. The fitted regression line may still exist, but it explains only a small proportion of the variability in $Y$.

Thus, we expect $R^2$ to be relatively small, possibly close to $0$.

In [6]:
x2 = [1, 2, 3, 4, 5, 6, 7, 8]
y2 = [5.2, 7.8, 4.9, 6.1, 5.7, 7.0, 5.4, 6.3]

res2 = regression_r2_analysis(x2, y2, title="Example 2: Weak Linear Relationship")
Example 2: Weak Linear Relationship
-----------------------------------
Intercept b0 = 5.9643
Slope b1     = 0.0190
SST          = 6.6200
SSE          = 6.6048
SSR          = 0.0152
R^2          = 0.0023
Check: 1 - SSE/SST = 0.0023

2. Confidence Interval for the Slope $\beta_1$

📌 Distribution of the OLS Slope Estimator $\hat{\beta}_1$¶

We want to understand why:

$$ \hat{\beta}_1 \sim N\left(\beta_1, \frac{\sigma^2}{S_{xx}}\right), \quad \text{where } S_{xx} = \sum (x_i - \bar{x})^2 $$

1. Model Assumptions¶

We consider the simple linear regression model:

$$ Y_i = \beta_0 + \beta_1 x_i + \varepsilon_i, $$

where:

  • $\varepsilon_i \sim N(0, \sigma^2)$,
  • $\varepsilon_i$ are independent.

Thus,

$$ Y_i \sim N(\beta_0 + \beta_1 x_i, \sigma^2). $$

2. Expression for the OLS Estimator¶

The least squares estimator for the slope is:

$$ \hat{\beta}_1 = \frac{\sum (x_i - \bar{x}) Y_i}{S_{xx}}, \quad S_{xx} = \sum (x_i - \bar{x})^2. $$

Why this form is valid: the slope estimator can be written in either of the following equivalent forms:

$$ \hat{\beta}_1 = \frac{\sum_{i=1}^n (x_i-\bar{x})(y_i-\bar{y})}{\sum_{i=1}^n (x_i-\bar{x})^2} $$

or

$$ \hat{\beta}_1 = \frac{\sum_{i=1}^n (x_i-\bar{x})y_i}{\sum_{i=1}^n (x_i-\bar{x})^2}. $$

These are equivalent because

$$ \sum_{i=1}^n (x_i-\bar{x})(y_i-\bar{y}) = \sum_{i=1}^n (x_i-\bar{x})y_i - \bar{y}\sum_{i=1}^n (x_i-\bar{x}), $$

and since

$$ \sum_{i=1}^n (x_i-\bar{x}) = 0, $$

we obtain

$$ \sum_{i=1}^n (x_i-\bar{x})(y_i-\bar{y}) = \sum_{i=1}^n (x_i-\bar{x})y_i. $$

3. Substituting the Model¶

Substitute $Y_i = \beta_0 + \beta_1 x_i + \varepsilon_i$:

$$ \hat{\beta}_1 = \frac{\sum (x_i - \bar{x})(\beta_0 + \beta_1 x_i + \varepsilon_i)}{S_{xx}}. $$

Expanding:

$$ \hat{\beta}_1 = \frac{ \beta_0 \sum (x_i - \bar{x}) + \beta_1 \sum (x_i - \bar{x}) x_i + \sum (x_i - \bar{x}) \varepsilon_i }{S_{xx}}. $$

4. Simplifications¶

We use the identities:

$$ \sum (x_i - \bar{x}) = 0, $$

$$ \sum (x_i - \bar{x}) x_i = S_{xx}. $$

Thus:

$$ \hat{\beta}_1 = \frac{ 0 + \beta_1 S_{xx} + \sum (x_i - \bar{x}) \varepsilon_i }{S_{xx}}. $$

5. Final Representation¶

We obtain:

$$ \hat{\beta}_1 = \beta_1 + \frac{\sum (x_i - \bar{x}) \varepsilon_i}{S_{xx}}. $$

This is the key representation.


6. Expectation¶

Since $E[\varepsilon_i] = 0$:

$$ E[\hat{\beta}_1] = \beta_1. $$

Thus, $\hat{\beta}_1$ is unbiased.


7. Variance¶

Using independence of the errors:

$$ Var(\hat{\beta}_1) = \frac{1}{S_{xx}^2} \sum (x_i - \bar{x})^2 Var(\varepsilon_i). $$

Since $Var(\varepsilon_i) = \sigma^2$:

$$ Var(\hat{\beta}_1) = \frac{\sigma^2}{S_{xx}^2} \sum (x_i - \bar{x})^2 = \frac{\sigma^2}{S_{xx}}. $$

8. Normality¶

From the representation:

$$ \hat{\beta}_1 = \beta_1 + \frac{\sum (x_i - \bar{x}) \varepsilon_i}{S_{xx}}, $$

we see that $\hat{\beta}_1$ is a linear combination of the errors $\varepsilon_i$.

Since:

  • $\varepsilon_i \sim N(0, \sigma^2)$,
  • they are independent,

any linear combination of them is also normal.

Thus:

$$ \sum (x_i - \bar{x}) \varepsilon_i \sim N\left(0, \sigma^2 S_{xx}\right). $$

Dividing by $S_{xx}$:

$$ \hat{\beta}_1 \sim N\left(\beta_1, \frac{\sigma^2}{S_{xx}}\right). $$

9. Final Result¶

$$ \boxed{ \hat{\beta}_1 \sim N\left(\beta_1, \frac{\sigma^2}{S_{xx}}\right) } $$

10. Interpretation¶

  • Larger spread in $x$ (large $S_{xx}$) $\Rightarrow$ more precise estimate
  • Larger noise $\sigma^2$ $\Rightarrow$ higher uncertainty
  • $\hat{\beta}_1$ equals $\beta_1$ plus a weighted sum of the noise terms

⚠️ Important Remark¶

  • Exact normality holds only if errors are normal
  • Without normality:
$$ \hat{\beta}_1 \text{ is still unbiased but only approximately normal (CLT)} $$
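As an illustration of this result, a Monte Carlo sketch with simulated data (the model parameters below are arbitrary illustrative choices) comparing the sampling distribution of $\hat{\beta}_1$ with $N(\beta_1, \sigma^2/S_{xx})$:

In [ ]:
# Monte Carlo sketch: simulate b1 many times and compare its empirical
# mean and variance with the theoretical values.
import numpy as np

rng = np.random.default_rng(42)

beta0, beta1, sigma = 1.0, 2.0, 1.5
x = np.linspace(0, 10, 20)
Sxx = np.sum((x - x.mean())**2)

B = 10_000
b1_draws = np.empty(B)
for b in range(B):
    y = beta0 + beta1 * x + rng.normal(0, sigma, size=x.size)
    b1_draws[b] = np.sum((x - x.mean()) * (y - y.mean())) / Sxx

print(f"mean of b1 draws: {b1_draws.mean():.4f}   (theory: {beta1})")
print(f"var  of b1 draws: {b1_draws.var():.5f}  (theory: {sigma**2 / Sxx:.5f})")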

2. Residual Sum of Squares (SSE)¶

The sum of squared errors is:

$$ SSE = \sum_{i=1}^n e_i^2 = \sum_{i=1}^n (Y_i - \hat{Y}_i)^2. $$

This measures the unexplained variation in the data.


3. Estimation of $\sigma^2$¶

We define the estimator of $\sigma^2$ as:

$$ s^2 = \frac{SSE}{n-2}. $$

4. Unbiasedness of $s^2$¶

A fundamental result:

$$ E[SSE] = (n-2)\sigma^2. $$

Therefore:

$$ E[s^2] = \sigma^2. $$

So $s^2$ is an unbiased estimator of $\sigma^2$.
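A small simulation check of this unbiasedness claim (illustrative parameters, not seminar data):

In [ ]:
# Simulation check that E[s^2] = sigma^2 with s^2 = SSE / (n - 2).
import numpy as np

rng = np.random.default_rng(7)
beta0, beta1, sigma = 1.0, 2.0, 1.5
x = np.linspace(0, 10, 20)
n = x.size

s2_draws = np.empty(10_000)
for b in range(s2_draws.size):
    y = beta0 + beta1 * x + rng.normal(0, sigma, size=n)
    b1 = np.sum((x - x.mean()) * (y - y.mean())) / np.sum((x - x.mean())**2)
    b0 = y.mean() - b1 * x.mean()
    s2_draws[b] = np.sum((y - (b0 + b1 * x))**2) / (n - 2)

print(f"average s^2: {s2_draws.mean():.4f}   (sigma^2 = {sigma**2})")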


5. Variance of $\hat{\beta}_1$¶

From previous derivation:

$$ \hat{\beta}_1 = \beta_1 + \frac{\sum (x_i - \bar{x})\varepsilon_i}{S_{xx}}. $$

Thus:

$$ Var(\hat{\beta}_1) = \frac{\sigma^2}{S_{xx}}, \quad S_{xx} = \sum (x_i - \bar{x})^2. $$

6. Estimation of the Variance¶

Since $\sigma^2$ is unknown, we replace it with $s^2$:

$$ \widehat{Var}(\hat{\beta}_1) = \frac{s^2}{S_{xx}}. $$

This is a plug-in estimator.


7. Standard Error¶

The standard error is the square root of the estimated variance:

$$ SE(\hat{\beta}_1) = \sqrt{\widehat{Var}(\hat{\beta}_1)} = \sqrt{\frac{s^2}{S_{xx}}}. $$

8. Distribution of the Standardized Estimator¶

We know:

$$ \frac{\hat{\beta}_1 - \beta_1}{\sqrt{\sigma^2/S_{xx}}} \sim N(0,1). $$

Also, from theory:

$$ \frac{SSE}{\sigma^2} \sim \chi^2_{n-2}. $$

9. Independence (crucial result)¶

A key theorem in regression:

  • $\hat{\beta}_1$ and $SSE$ are independent

This is a non-trivial result and follows from orthogonality of projections in the linear model.


10. Construction of the $t$-statistic¶

We replace $\sigma^2$ by $s^2$:

$$ T = \frac{\hat{\beta}_1 - \beta_1}{\sqrt{s^2/S_{xx}}}. $$

Rewrite:

$$ T = \frac{ \frac{\hat{\beta}_1 - \beta_1}{\sqrt{\sigma^2/S_{xx}}} }{ \sqrt{\frac{SSE}{(n-2)\sigma^2}} }. $$

11. Distribution¶

We now have:

  • Numerator $\sim N(0,1)$
  • Denominator $\sim \sqrt{\chi^2_{n-2}/(n-2)}$
  • Independent

Therefore:

$$ T \sim t_{n-2}. $$

12. Final Result¶

$$ \boxed{ T = \frac{\hat{\beta}_1 - \beta_1}{SE(\hat{\beta}_1)} = \frac{\hat{\beta}_1 - \beta_1}{\sqrt{s^2/S_{xx}}} \sim t_{n-2} } $$

13. Confidence Interval¶

$$ \boxed{ \hat{\beta}_1 \pm t_{\alpha/2, n-2} \cdot SE(\hat{\beta}_1) } $$
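A minimal sketch of this interval in code, reusing the data from Example 1 above; the 95% level is an illustrative choice:

In [ ]:
# Confidence interval for the slope on the Example 1 data.
import numpy as np
from scipy.stats import t

x = np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype=float)
y = np.array([2.1, 4.2, 6.1, 8.3, 9.9, 12.2, 13.8, 16.1])

n = len(x)
Sxx = np.sum((x - x.mean())**2)
b1 = np.sum((x - x.mean()) * (y - y.mean())) / Sxx
b0 = y.mean() - b1 * x.mean()

SSE = np.sum((y - (b0 + b1 * x))**2)
s2 = SSE / (n - 2)              # unbiased estimator of sigma^2
SE_b1 = np.sqrt(s2 / Sxx)       # standard error of the slope

alpha = 0.05
t_crit = t.ppf(1 - alpha / 2, df=n - 2)
print(f"95% CI for beta1: ({b1 - t_crit * SE_b1:.4f}, {b1 + t_crit * SE_b1:.4f})")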

3. Confidence Interval for the Intercept $\beta_0$


Estimator¶

$$ \hat{\beta}_0 = \bar{Y} - \hat{\beta}_1 \bar{x} $$

Variance¶

$$ Var(\hat{\beta}_0) = \sigma^2 \left( \frac{1}{n} + \frac{\bar{x}^2}{S_{xx}} \right) $$

Estimated Variance¶

$$ \widehat{Var}(\hat{\beta}_0) = s^2 \left( \frac{1}{n} + \frac{\bar{x}^2}{S_{xx}} \right) $$

Standard Error¶

$$ SE(\hat{\beta}_0) = \sqrt{ s^2 \left( \frac{1}{n} + \frac{\bar{x}^2}{S_{xx}} \right) } $$

Confidence Interval¶

$$ \boxed{ \hat{\beta}_0 \pm t_{\alpha/2, n-2} \cdot SE(\hat{\beta}_0) } $$
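The intercept interval has the same structure; continuing directly from the slope sketch above:

In [ ]:
# Confidence interval for the intercept, continuing the sketch above.
SE_b0 = np.sqrt(s2 * (1 / n + x.mean()**2 / Sxx))
print(f"95% CI for beta0: ({b0 - t_crit * SE_b0:.4f}, {b0 + t_crit * SE_b0:.4f})")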

4. Hypothesis Testing for Slope and Intercept


Test for Slope¶

Hypotheses¶

$$ H_0: \beta_1 = \beta_1^0 \quad \text{vs} \quad H_1: \beta_1 \neq \beta_1^0 $$

Test Statistic¶

$$ T = \frac{\hat{\beta}_1 - \beta_1^0}{SE(\hat{\beta}_1)} \sim t_{n-2} $$

Decision Rule¶

Reject $H_0$ if:

$$ |T| > t_{\alpha/2, n-2} $$

Test for Intercept¶

Hypotheses¶

$$ H_0: \beta_0 = \beta_0^0 \quad \text{vs} \quad H_1: \beta_0 \neq \beta_0^0 $$

Test Statistic¶

$$ T = \frac{\hat{\beta}_0 - \beta_0^0}{SE(\hat{\beta}_0)} \sim t_{n-2} $$
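A minimal sketch of both tests, continuing from the confidence-interval sketches above and taking the common null values $\beta_1^0 = \beta_0^0 = 0$ (an illustrative choice):

In [ ]:
# t-tests for H0: beta1 = 0 and H0: beta0 = 0, continuing the sketch above.
from scipy.stats import t

T_b1 = (b1 - 0) / SE_b1
T_b0 = (b0 - 0) / SE_b0
p_b1 = 2 * (1 - t.cdf(abs(T_b1), df=n - 2))
p_b0 = 2 * (1 - t.cdf(abs(T_b0), df=n - 2))

print(f"slope:     T = {T_b1:.4f}, p-value = {p_b1:.4g}")
print(f"intercept: T = {T_b0:.4f}, p-value = {p_b0:.4g}")
print(f"reject H0: beta1 = 0 at the 5% level? {abs(T_b1) > t_crit}")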

5. Prediction Intervals in Linear Regression


Mean Response¶

$$ \hat{Y}_0 = \hat{\beta}_0 + \hat{\beta}_1 x_0 $$

Variance (Mean Response)¶

$$ Var(\hat{Y}_0) = \sigma^2 \left( \frac{1}{n} + \frac{(x_0 - \bar{x})^2}{S_{xx}} \right) $$

📌 Variance of the Mean Response $\hat{Y}_0$¶

We want to understand why:

$$ Var(\hat{Y}_0) = \sigma^2 \left( \frac{1}{n} + \frac{(x_0 - \bar{x})^2}{S_{xx}} \right), \quad S_{xx} = \sum (x_i - \bar{x})^2. $$

1. Rewrite $\hat{Y}_0$¶

Recall:

$$ \hat{\beta}_0 = \bar{Y} - \hat{\beta}_1 \bar{x}. $$

Substitute into $\hat{Y}_0$:

$$ \hat{Y}_0 = \bar{Y} - \hat{\beta}_1 \bar{x} + \hat{\beta}_1 x_0 = \bar{Y} + \hat{\beta}_1 (x_0 - \bar{x}). $$

2. Variance Decomposition¶

We now compute:

$$ Var(\hat{Y}_0) = Var\left(\bar{Y} + \hat{\beta}_1 (x_0 - \bar{x})\right). $$

Using variance properties:

$$ Var(aX + bY) = a^2 Var(X) + b^2 Var(Y) + 2ab \, Cov(X,Y), $$

we get:

$$ Var(\hat{Y}_0) = Var(\bar{Y}) + (x_0 - \bar{x})^2 Var(\hat{\beta}_1) + 2(x_0 - \bar{x}) Cov(\bar{Y}, \hat{\beta}_1). $$

3. Compute Each Term¶

(i) Variance of $\bar{Y}$¶

Since $Y_i$ are independent with variance $\sigma^2$:

$$ Var(\bar{Y}) = \frac{\sigma^2}{n}. $$

(ii) Variance of $\hat{\beta}_1$¶

From previous results:

$$ Var(\hat{\beta}_1) = \frac{\sigma^2}{S_{xx}}. $$

(iii) Covariance term¶

We compute:

$$ Cov(\bar{Y}, \hat{\beta}_1). $$

Recall:

$$ \bar{Y} = \frac{1}{n}\sum Y_i, \quad \hat{\beta}_1 = \frac{\sum (x_i - \bar{x}) Y_i}{S_{xx}}. $$

Thus, using bilinearity of covariance and independence of the $Y_i$ (only the $i = j$ terms contribute):

$$ Cov(\bar{Y}, \hat{\beta}_1) = \frac{1}{n S_{xx}} \sum (x_i - \bar{x}) \, Var(Y_i). $$

Since $Var(Y_i)=\sigma^2$:

$$ Cov(\bar{Y}, \hat{\beta}_1) = \frac{\sigma^2}{n S_{xx}} \sum (x_i - \bar{x}). $$

But:

$$ \sum (x_i - \bar{x}) = 0, $$

so:

$$ Cov(\bar{Y}, \hat{\beta}_1) = 0. $$

4. Final Formula¶

Putting everything together:

$$ Var(\hat{Y}_0) = \frac{\sigma^2}{n} + (x_0 - \bar{x})^2 \frac{\sigma^2}{S_{xx}}. $$

Factor out $\sigma^2$:

$$ \boxed{ Var(\hat{Y}_0) = \sigma^2 \left( \frac{1}{n} + \frac{(x_0 - \bar{x})^2}{S_{xx}} \right). } $$

3. Standard error¶

Taking the square root,

$$ SE(\hat{Y}_0) = \sqrt{ \sigma^2 \left( \frac{1}{n} + \frac{(x_0-\bar{x})^2}{S_{xx}} \right) }. $$

Since $\sigma^2$ is unknown, we estimate it by

$$ s^2=\frac{SSE}{n-2}. $$

Thus the estimated standard error is

$$ \widehat{SE}(\hat{Y}_0) = \sqrt{ s^2 \left( \frac{1}{n} + \frac{(x_0-\bar{x})^2}{S_{xx}} \right) }. $$

4. Distribution for the mean response estimator¶

We now standardize:

$$ \frac{ \hat{Y}_0 - (\beta_0+\beta_1 x_0) }{ \sqrt{ \sigma^2 \left( \frac{1}{n} + \frac{(x_0-\bar{x})^2}{S_{xx}} \right) } } \sim N(0,1). $$

Also,

$$ \frac{SSE}{\sigma^2}\sim \chi^2_{n-2}, $$

and this quantity is independent of $\hat{Y}_0$.

Therefore,

$$ \frac{ \hat{Y}_0 - (\beta_0+\beta_1 x_0) }{ \sqrt{ s^2 \left( \frac{1}{n} + \frac{(x_0-\bar{x})^2}{S_{xx}} \right) } } \sim t_{n-2}. $$

5. Confidence interval for the mean response¶

Hence

$$ P\left( -t_{\alpha/2,n-2} \le \frac{ \hat{Y}_0 - (\beta_0+\beta_1 x_0) }{ \sqrt{ s^2 \left( \frac{1}{n} + \frac{(x_0-\bar{x})^2}{S_{xx}} \right) } } \le t_{\alpha/2,n-2} \right)=1-\alpha. $$

Multiplying through by the standard error and rearranging gives

$$ P\left( \hat{Y}_0 - t_{\alpha/2,n-2} \sqrt{ s^2 \left( \frac{1}{n} + \frac{(x_0-\bar{x})^2}{S_{xx}} \right) } \le \beta_0+\beta_1x_0 \le \hat{Y}_0 + t_{\alpha/2,n-2} \sqrt{ s^2 \left( \frac{1}{n} + \frac{(x_0-\bar{x})^2}{S_{xx}} \right) } \right) = 1-\alpha. $$

Therefore, the $(1-\alpha)$ confidence interval for the mean response is

$$ \boxed{ \hat{Y}_0 \pm t_{\alpha/2,n-2} \sqrt{ s^2 \left( \frac{1}{n} + \frac{(x_0-\bar{x})^2}{S_{xx}} \right) } } $$

6. Prediction of a new observation¶

Now suppose we want to predict not the mean, but a new observation at $x_0$.

Let

$$ Y_{\text{new}} = \beta_0+\beta_1x_0+\varepsilon_{\text{new}}, $$

where

$$ \varepsilon_{\text{new}} \sim N(0,\sigma^2) $$

and $\varepsilon_{\text{new}}$ is independent of the original sample.

We want an interval for $Y_{\text{new}}$.


7. Prediction error¶

Consider

$$ Y_{\text{new}}-\hat{Y}_0. $$

Write it as

$$ Y_{\text{new}}-\hat{Y}_0 = (\beta_0+\beta_1x_0+\varepsilon_{\text{new}})-\hat{Y}_0. $$

Equivalently,

$$ Y_{\text{new}}-\hat{Y}_0 = \big((\beta_0+\beta_1x_0)-\hat{Y}_0\big) + \varepsilon_{\text{new}}. $$

The first term comes from estimating the regression line; the second is the irreducible random noise of the future observation.


8. Variance of the prediction error¶

Since $\varepsilon_{\text{new}}$ is independent of $\hat{Y}_0$,

$$ Var(Y_{\text{new}}-\hat{Y}_0) = Var\big((\beta_0+\beta_1x_0)-\hat{Y}_0\big) + Var(\varepsilon_{\text{new}}). $$

Because subtracting a constant does not change variance,

$$ Var\big((\beta_0+\beta_1x_0)-\hat{Y}_0\big) = Var(\hat{Y}_0). $$

So

$$ Var(Y_{\text{new}}-\hat{Y}_0) = Var(\hat{Y}_0)+\sigma^2. $$

Using the previous formula for $Var(\hat{Y}_0)$,

$$ Var(Y_{\text{new}}-\hat{Y}_0) = \sigma^2 \left( \frac{1}{n} + \frac{(x_0-\bar{x})^2}{S_{xx}} \right) +\sigma^2. $$

Hence

$$ Var(Y_{\text{new}}-\hat{Y}_0) = \sigma^2 \left( 1+\frac{1}{n} + \frac{(x_0-\bar{x})^2}{S_{xx}} \right). $$

Therefore,

$$ \boxed{ Var(Y_{\text{new}}-\hat{Y}_0) = \sigma^2 \left( 1+\frac{1}{n} + \frac{(x_0-\bar{x})^2}{S_{xx}} \right) } $$

and the estimated standard error is

$$ \widehat{SE}(Y_{\text{new}}-\hat{Y}_0) = \sqrt{ s^2 \left( 1+\frac{1}{n} + \frac{(x_0-\bar{x})^2}{S_{xx}} \right) }. $$

9. $t$-statistic for prediction¶

Using normality and independence,

$$ \frac{ Y_{\text{new}}-\hat{Y}_0 }{ \sqrt{ s^2 \left( 1+\frac{1}{n} + \frac{(x_0-\bar{x})^2}{S_{xx}} \right) } } \sim t_{n-2}. $$

Therefore,

$$ P\left( -t_{\alpha/2,n-2} \le \frac{ Y_{\text{new}}-\hat{Y}_0 }{ \sqrt{ s^2 \left( 1+\frac{1}{n} + \frac{(x_0-\bar{x})^2}{S_{xx}} \right) } } \le t_{\alpha/2,n-2} \right) = 1-\alpha. $$

Rearranging yields the prediction interval.


10. Prediction interval¶

Thus, the $(1-\alpha)$ prediction interval for a new observation at $x_0$ is

$$ \boxed{ \hat{Y}_0 \pm t_{\alpha/2,n-2} \sqrt{ s^2 \left( 1+\frac{1}{n} + \frac{(x_0-\bar{x})^2}{S_{xx}} \right) } } $$

11. Why is the prediction interval wider?¶

Compare the two formulas:

Confidence interval for the mean response¶

$$ \hat{Y}_0 \pm t_{\alpha/2,n-2} \sqrt{ s^2 \left( \frac{1}{n} + \frac{(x_0-\bar{x})^2}{S_{xx}} \right) } $$

Prediction interval¶

$$ \hat{Y}_0 \pm t_{\alpha/2,n-2} \sqrt{ s^2 \left( 1+\frac{1}{n} + \frac{(x_0-\bar{x})^2}{S_{xx}} \right) } $$

The prediction interval has an additional term: the constant $1$ inside the parentheses.

This extra $1$ appears because a future observation has its own random error term $\varepsilon_{\text{new}}$ with variance $\sigma^2$.

So:

  • the confidence interval for the mean response reflects uncertainty in estimating the regression line,
  • the prediction interval reflects both:
    • uncertainty in estimating the line,
    • random noise of a new observation.

That is why prediction intervals are always wider.



📊 Confidence Bands and Prediction Bands in Linear Regression¶


1. Motivation¶

So far, we have constructed intervals at a single point $x_0$:

  • Confidence interval for the mean response
  • Prediction interval for a new observation

Now we want something stronger:

intervals that hold for all values of $x$ simultaneously

These are called:

  • Confidence bands
  • Prediction bands

2. From Intervals to Bands¶

Pointwise interval (what we had before)¶

At a fixed $x_0$:

$$ \hat{Y}_0 \pm t_{\alpha/2,n-2} \cdot SE(\hat{Y}_0) $$

👉 Valid only for one specific $x_0$


Band (new concept)¶

We want:

$$ P\left( \text{the entire function } \beta_0 + \beta_1 x \text{ lies inside the band for all } x \right) = 1-\alpha $$

3. Confidence Band for the Mean Response¶

We want a band for:

$$ E[Y \mid x] = \beta_0 + \beta_1 x. $$

Key idea¶

Instead of controlling error at a single point, we control:

$$ \sup_x \left| \frac{\hat{Y}(x) - (\beta_0 + \beta_1 x)} {SE(\hat{Y}(x))} \right| $$

4. Working–Hotelling Confidence Band¶

A classical result gives the band:

$$ \boxed{ \hat{Y}(x) \pm \sqrt{2 F_{\alpha,2,n-2}} \cdot \sqrt{ s^2 \left( \frac{1}{n} + \frac{(x-\bar{x})^2}{S_{xx}} \right) } } $$

where:

  • $F_{\alpha,2,n-2}$ is the upper-$\alpha$ quantile of the $F$ distribution with $2$ and $n-2$ degrees of freedom
  • $s^2 = \frac{SSE}{n-2}$

Interpretation¶

With probability $1-\alpha$:

$$ \beta_0 + \beta_1 x \text{ lies inside the band for all } x. $$

5. Comparison with Pointwise CI¶

Pointwise CI:

$$ \hat{Y}(x) \pm t_{\alpha/2,n-2} \cdot SE(\hat{Y}(x)) $$

Band:

$$ \hat{Y}(x) \pm \sqrt{2F_{\alpha,2,n-2}} \cdot SE(\hat{Y}(x)) $$

Important¶

$$ \sqrt{2F_{\alpha,2,n-2}} > t_{\alpha/2,n-2} $$

👉 Bands are wider than pointwise intervals
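This inequality can be checked numerically for a few sample sizes (an illustrative sketch at $\alpha = 0.05$):

In [ ]:
# Compare the Working-Hotelling multiplier W with the pointwise t multiplier.
import numpy as np
from scipy.stats import t, f

alpha = 0.05
for n in [10, 30, 100]:
    t_crit = t.ppf(1 - alpha / 2, df=n - 2)
    W = np.sqrt(2 * f.ppf(1 - alpha, dfn=2, dfd=n - 2))
    print(f"n = {n:3d}: t = {t_crit:.4f}, W = {W:.4f}")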


6. Prediction Band¶

Now we consider future observations:

$$ Y = \beta_0 + \beta_1 x + \varepsilon. $$

Prediction variance¶

$$ Var(Y - \hat{Y}(x)) = \sigma^2 \left( 1 + \frac{1}{n} + \frac{(x-\bar{x})^2}{S_{xx}} \right) $$

Prediction Band¶

$$ \boxed{ \hat{Y}(x) \pm \sqrt{2 F_{\alpha,2,n-2}} \cdot \sqrt{ s^2 \left( 1 + \frac{1}{n} + \frac{(x-\bar{x})^2}{S_{xx}} \right) } } $$

7. Summary of All Intervals¶

Pointwise Confidence Interval¶

$$ \hat{Y}(x) \pm t_{\alpha/2,n-2} \cdot \sqrt{ s^2 \left( \frac{1}{n} + \frac{(x-\bar{x})^2}{S_{xx}} \right) } $$

Confidence Band¶

$$ \hat{Y}(x) \pm \sqrt{2F_{\alpha,2,n-2}} \cdot \sqrt{ s^2 \left( \frac{1}{n} + \frac{(x-\bar{x})^2}{S_{xx}} \right) } $$

Prediction Interval¶

$$ \hat{Y}(x) \pm t_{\alpha/2,n-2} \cdot \sqrt{ s^2 \left( 1 + \frac{1}{n} + \frac{(x-\bar{x})^2}{S_{xx}} \right) } $$

Prediction Band¶

$$ \hat{Y}(x) \pm \sqrt{2F_{\alpha,2,n-2}} \cdot \sqrt{ s^2 \left( 1 + \frac{1}{n} + \frac{(x-\bar{x})^2}{S_{xx}} \right) } $$

8. Key Insights¶

🔹 Confidence vs Prediction¶

  • Confidence band → uncertainty of regression line
  • Prediction band → uncertainty of data points

🔹 Pointwise vs Simultaneous¶

  • Pointwise: valid at one $x$
  • Band: valid for all $x$ simultaneously

🔹 Width comparison¶

$$ \text{Prediction band} > \text{Confidence band} > \text{Pointwise CI} $$

9. Geometry (important intuition)¶

  • Bands are narrowest at $x = \bar{x}$
  • Bands widen as $|x - \bar{x}|$ increases
  • Shape: hyperbolic envelope around the line

10. Final Interpretation¶

  • Confidence band:

    “With probability $1-\alpha$, the entire true regression line lies inside the band”

  • Prediction band:

    “With probability $1-\alpha$, all future observations lie inside the band (simultaneously)”


⚠️ Important Remark¶

  • Bands are much more conservative than pointwise intervals
  • Useful when making conclusions over a range of $x$ values


Example: Pointwise Confidence Interval, Prediction Interval, Confidence Band, and Prediction Band¶

In this example, we:

  1. fit a simple linear regression model,
  2. compute the least squares regression line,
  3. choose a point $x_0$,
  4. compute:
    • the pointwise confidence interval for the mean response at $x_0$,
    • the pointwise prediction interval for a new observation at $x_0$,
  5. plot:
    • the data,
    • the fitted regression line,
    • the vertical line at $x_0$,
    • the confidence and prediction intervals at $x_0$,
    • the confidence band,
    • the prediction band.

We use the formulas

$$ \hat{Y}(x) = \hat{\beta}_0 + \hat{\beta}_1 x, $$

$$ s^2 = \frac{SSE}{n-2}, $$

$$ SE_{\text{mean}}(x) = \sqrt{ s^2\left( \frac{1}{n} + \frac{(x-\bar{x})^2}{S_{xx}} \right) }, $$

$$ SE_{\text{pred}}(x) = \sqrt{ s^2\left( 1 + \frac{1}{n} + \frac{(x-\bar{x})^2}{S_{xx}} \right) }. $$

Pointwise intervals:

$$ \hat{Y}(x_0) \pm t_{\alpha/2,n-2} \, SE_{\text{mean}}(x_0), $$

$$ \hat{Y}(x_0) \pm t_{\alpha/2,n-2} \, SE_{\text{pred}}(x_0). $$

Working–Hotelling confidence band:

$$ \hat{Y}(x) \pm W \, SE_{\text{mean}}(x), \qquad W = \sqrt{2F_{\alpha,2,n-2}} $$

Prediction band (using the same simultaneous multiplier here for illustration):

$$ \hat{Y}(x) \pm W \, SE_{\text{pred}}(x) $$
In [7]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import t, f
In [8]:
# Sample data
x = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=float)
y = np.array([2.3, 2.9, 3.8, 5.1, 5.0, 6.4, 7.1, 7.9, 9.2, 9.8], dtype=float)

n = len(x)
x_bar = np.mean(x)
y_bar = np.mean(y)

# OLS estimates
Sxx = np.sum((x - x_bar)**2)
Sxy = np.sum((x - x_bar)*(y - y_bar))

b1 = Sxy / Sxx
b0 = y_bar - b1 * x_bar

# Fitted values and residuals
y_hat = b0 + b1 * x
residuals = y - y_hat

# SSE and variance estimate
SSE = np.sum(residuals**2)
s2 = SSE / (n - 2)
s = np.sqrt(s2)

print(f"b0 = {b0:.4f}")
print(f"b1 = {b1:.4f}")
print(f"SSE = {SSE:.4f}")
print(f"s^2 = {s2:.4f}")
print(f"s = {s:.4f}")
b0 = 1.3000
b1 = 0.8455
SSE = 0.6145
s^2 = 0.0768
s = 0.2772
In [27]:
# Choose the point x0 where we want pointwise intervals
x0 = 4.5

# Predicted mean at x0
y0_hat = b0 + b1 * x0

# Standard errors at x0
SE_mean_x0 = np.sqrt(s2 * (1/n + (x0 - x_bar)**2 / Sxx))
SE_pred_x0 = np.sqrt(s2 * (1 + 1/n + (x0 - x_bar)**2 / Sxx))

# Quantiles
alpha = 0.001
t_crit = t.ppf(1 - alpha/2, df=n-2)
W = np.sqrt(2 * f.ppf(1 - alpha, dfn=2, dfd=n-2))

# Pointwise confidence interval for mean response at x0
ci_mean_lower = y0_hat - t_crit * SE_mean_x0
ci_mean_upper = y0_hat + t_crit * SE_mean_x0

# Pointwise prediction interval for a new observation at x0
pi_lower = y0_hat - t_crit * SE_pred_x0
pi_upper = y0_hat + t_crit * SE_pred_x0

print(f"x0 = {x0}")
print(f"Predicted value y_hat(x0) = {y0_hat:.4f}")
print()
print("Pointwise confidence interval for mean response:")
print(f"({ci_mean_lower:.4f}, {ci_mean_upper:.4f})")
print()
print("Pointwise prediction interval:")
print(f"({pi_lower:.4f}, {pi_upper:.4f})")
print()
print(f"t critical value = {t_crit:.4f}")
print(f"Working-Hotelling multiplier W = {W:.4f}")
x0 = 4.5
Predicted value y_hat(x0) = 5.1045

Pointwise confidence interval for mean response:
(4.6367, 5.5724)

Pointwise prediction interval:
(3.6310, 6.5780)

t critical value = 5.0413
Working-Hotelling multiplier W = 6.0817
In [28]:
# Grid of x-values for plotting the fitted line and bands
x_grid = np.linspace(np.min(x), np.max(x), 400)
y_grid = b0 + b1 * x_grid

# Standard errors on the grid
SE_mean_grid = np.sqrt(s2 * (1/n + (x_grid - x_bar)**2 / Sxx))
SE_pred_grid = np.sqrt(s2 * (1 + 1/n + (x_grid - x_bar)**2 / Sxx))

# Pointwise confidence band (if you want to compare)
ci_mean_grid_lower = y_grid - t_crit * SE_mean_grid
ci_mean_grid_upper = y_grid + t_crit * SE_mean_grid

# Confidence band (Working-Hotelling)
conf_band_lower = y_grid - W * SE_mean_grid
conf_band_upper = y_grid + W * SE_mean_grid

# Prediction band (using same simultaneous multiplier W for illustration)
pred_band_lower = y_grid - W * SE_pred_grid
pred_band_upper = y_grid + W * SE_pred_grid
In [29]:
plt.figure(figsize=(10, 7))

# Scatter plot of data
plt.scatter(x, y, label="Data")

# Least squares line
plt.plot(x_grid, y_grid, linewidth=2, label="LS regression line")

# Confidence band
plt.fill_between(
    x_grid, conf_band_lower, conf_band_upper,
    alpha=0.20, label="Confidence band"
)

# Prediction band
plt.fill_between(
    x_grid, pred_band_lower, pred_band_upper,
    alpha=0.12, label="Prediction band"
)

# Vertical line at x0
plt.axvline(x=x0, linestyle="--", linewidth=1.5, label=r"$x_0$")

# Mark predicted point on regression line
plt.scatter([x0], [y0_hat], s=80, zorder=5, label=r"$\hat{Y}_0$")

# Draw pointwise confidence interval at x0
plt.plot([x0, x0], [ci_mean_lower, ci_mean_upper], linewidth=4, label="Pointwise CI at $x_0$")

# Draw pointwise prediction interval at x0
plt.plot([x0, x0], [pi_lower, pi_upper], linewidth=2, label="Pointwise PI at $x_0$")

# Optional horizontal markers for interval endpoints
plt.plot([x0-0.12, x0+0.12], [ci_mean_lower, ci_mean_lower], linewidth=2)
plt.plot([x0-0.12, x0+0.12], [ci_mean_upper, ci_mean_upper], linewidth=2)

plt.plot([x0-0.18, x0+0.18], [pi_lower, pi_lower], linewidth=2)
plt.plot([x0-0.18, x0+0.18], [pi_upper, pi_upper], linewidth=2)

plt.xlabel("x")
plt.ylabel("y")
plt.title("Simple Linear Regression with Pointwise Intervals and Bands")
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

🏥 Hospital Infection Data¶

We analyze a dataset of $n = 58$ hospitals.

  • Response variable: $$ y = \text{infection risk (percentage)} $$

  • Predictor variable: $$ x = \text{average length of stay (days)} $$

We will:

  1. Load and explore the dataset
  2. Fit a simple linear regression model
  3. Compute:
    • coefficients
    • $R^2$
    • ANOVA quantities
  4. Construct:
    • confidence intervals
    • prediction intervals
  5. Visualize:
    • regression line
    • confidence band
    • prediction band

Preview of the dataset¶

In [36]:
import pandas as pd

data = pd.read_csv(
    "hospital.txt",
    sep=r"\s+",
    header=None
)

# If there are more than 2 columns, keep only first two
data = data.iloc[:, :2]

data.columns = ["Stay", "InfectionRisk"]

# Convert to numeric explicitly (very important)
data["Stay"] = pd.to_numeric(data["Stay"], errors="coerce")
data["InfectionRisk"] = pd.to_numeric(data["InfectionRisk"], errors="coerce")

# Drop bad rows if any
data = data.dropna()

print(data.head())
print(data.shape)
   Stay  InfectionRisk
1   5.0          11.20
2  10.0           8.84
3  11.0          11.07
4  13.0          12.78
5  18.0          11.62
(58, 2)
In [37]:
print(data.dtypes)
print(data.describe())
Stay             float64
InfectionRisk    float64
dtype: object
             Stay  InfectionRisk
count   58.000000       58.00000
mean    52.706897       10.04931
std     30.625645        1.44107
min      2.000000        7.39000
25%     28.250000        8.90500
50%     53.000000        9.93000
75%     77.750000       11.06000
max    109.000000       13.95000
In [38]:
import matplotlib.pyplot as plt

x = data["Stay"].values
y = data["InfectionRisk"].values

plt.figure(figsize=(8,5))

plt.scatter(x, y, s=50)

plt.xlabel("Average Length of Stay (days)", fontsize=12)
plt.ylabel("Infection Risk (%)", fontsize=12)
plt.title("Hospital Infection Data", fontsize=14)

plt.grid(True, linestyle="--", alpha=0.4)

plt.tight_layout()
plt.show()
In [42]:
#compute the regression quantities by hand

import numpy as np

n = len(x)
x_bar = np.mean(x)
y_bar = np.mean(y)

Sxx = np.sum((x - x_bar)**2)
Sxy = np.sum((x - x_bar) * (y - y_bar))

b1 = Sxy / Sxx
b0 = y_bar - b1 * x_bar

y_hat = b0 + b1 * x
residuals = y - y_hat

SSE = np.sum(residuals**2)
SSR = np.sum((y_hat - y_bar)**2)
SST = np.sum((y - y_bar)**2)

MSR = SSR / 1
MSE = SSE / (n - 2)
F_stat = MSR / MSE

R2 = SSR / SST
s = np.sqrt(MSE)

print(f"n   = {n}")
print(f"b0  = {b0:.4f}")
print(f"b1  = {b1:.4f}")
print(f"SST = {SST:.4f}")
print(f"SSR = {SSR:.4f}")
print(f"SSE = {SSE:.4f}")
print(f"MSE = {MSE:.4f}")
print(f"s   = {s:.5f}")
print(f"R^2 = {R2:.4f} = {100*R2:.2f}%")
print(f"F   = {F_stat:.4f}")
n   = 58
b0  = 10.2033
b1  = -0.0029
SST = 118.3710
SSR = 0.4563
SSE = 117.9147
MSE = 2.1056
s   = 1.45108
R^2 = 0.0039 = 0.39%
F   = 0.2167
In [43]:
x_grid = np.linspace(np.min(x), np.max(x), 400)
y_grid = b0 + b1 * x_grid

plt.figure(figsize=(8, 5))
plt.scatter(x, y, s=50, label="Data")
plt.plot(x_grid, y_grid, linewidth=2, label="Least squares line")
plt.xlabel("Average Length of Stay (days)", fontsize=12)
plt.ylabel("Infection Risk (%)", fontsize=12)
plt.title("Hospital Infection Data with LS Regression Line", fontsize=14)
plt.grid(True, linestyle="--", alpha=0.4)
plt.legend()
plt.tight_layout()
plt.show()
In [44]:
from scipy.stats import t, f

SE_b1 = np.sqrt(MSE / Sxx)
SE_b0 = np.sqrt(MSE * (1/n + x_bar**2 / Sxx))

t_b1 = b1 / SE_b1
t_b0 = b0 / SE_b0

p_b1 = 2 * (1 - t.cdf(abs(t_b1), df=n-2))
p_b0 = 2 * (1 - t.cdf(abs(t_b0), df=n-2))

anova_table = {
    "Source": ["Regression", "Error", "Total"],
    "DF": [1, n-2, n-1],
    "SS": [SSR, SSE, SST],
    "MS": [MSR, MSE, np.nan],
    "F": [F_stat, np.nan, np.nan],
    "P-value": [1 - f.cdf(F_stat, 1, n-2), np.nan, np.nan]
}

coef_table = {
    "Term": ["Intercept", "Stay"],
    "Coefficient": [b0, b1],
    "SE Coef": [SE_b0, SE_b1],
    "t-value": [t_b0, t_b1],
    "P-value": [p_b0, p_b1]
}

import pandas as pd

print("ANOVA table")
display(pd.DataFrame(anova_table))

print("Coefficient table")
display(pd.DataFrame(coef_table))
ANOVA table
       Source  DF          SS        MS         F  P-value
0  Regression   1    0.456263  0.456263  0.216688  0.64338
1       Error  56  117.914710  2.105620       NaN      NaN
2       Total  57  118.370972       NaN       NaN      NaN
Coefficient table
        Term  Coefficient   SE Coef    t-value  P-value
0  Intercept    10.203286  0.381729  26.729144  0.00000
1       Stay    -0.002921  0.006276  -0.465498  0.64338
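As an optional cross-check of the hand computations (assuming the statsmodels package is available; it is not used elsewhere in this seminar):

In [ ]:
# Cross-check with statsmodels (assumed installed): same coefficients,
# standard errors, t-tests, R^2, and F statistic as computed by hand above.
import statsmodels.api as sm

X = sm.add_constant(x)        # design matrix with an intercept column
model = sm.OLS(y, X).fit()
print(model.summary())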
In [48]:
# Choose any point of interest
x0 = 50

y0_hat = b0 + b1 * x0

SE_mean_x0 = np.sqrt(MSE * (1/n + (x0 - x_bar)**2 / Sxx))
SE_pred_x0 = np.sqrt(MSE * (1 + 1/n + (x0 - x_bar)**2 / Sxx))

alpha = 0.05
t_crit = t.ppf(1 - alpha/2, df=n-2)

ci_mean_lower = y0_hat - t_crit * SE_mean_x0
ci_mean_upper = y0_hat + t_crit * SE_mean_x0

pi_lower = y0_hat - t_crit * SE_pred_x0
pi_upper = y0_hat + t_crit * SE_pred_x0

print(f"x0 = {x0}")
print(f"Predicted mean response at x0: {y0_hat:.4f}")
print()
print(f"95% CI for the mean response at x0: ({ci_mean_lower:.4f}, {ci_mean_upper:.4f})")
print(f"95% PI for a new observation at x0: ({pi_lower:.4f}, {pi_upper:.4f})")
x0 = 50
Predicted mean response at x0: 10.0572

95% CI for the mean response at x0: (9.6740, 10.4404)
95% PI for a new observation at x0: (7.1252, 12.9892)
In [49]:
#confidence band and prediction band

alpha = 0.05

# Pointwise multiplier
t_crit = t.ppf(1 - alpha/2, df=n-2)

# Working-Hotelling multiplier for simultaneous confidence band
W = np.sqrt(2 * f.ppf(1 - alpha, dfn=2, dfd=n-2))

SE_mean_grid = np.sqrt(MSE * (1/n + (x_grid - x_bar)**2 / Sxx))
SE_pred_grid = np.sqrt(MSE * (1 + 1/n + (x_grid - x_bar)**2 / Sxx))

# Pointwise confidence interval curves
pointwise_ci_lower = y_grid - t_crit * SE_mean_grid
pointwise_ci_upper = y_grid + t_crit * SE_mean_grid

# Confidence band
conf_band_lower = y_grid - W * SE_mean_grid
conf_band_upper = y_grid + W * SE_mean_grid

# Prediction band
pred_band_lower = y_grid - W * SE_pred_grid
pred_band_upper = y_grid + W * SE_pred_grid
In [50]:
plt.figure(figsize=(10, 6))

plt.scatter(x, y, s=45, label="Data")
plt.plot(x_grid, y_grid, linewidth=2, label="LS regression line")

plt.fill_between(
    x_grid, conf_band_lower, conf_band_upper,
    alpha=0.20, label="Confidence band"
)

plt.fill_between(
    x_grid, pred_band_lower, pred_band_upper,
    alpha=0.10, label="Prediction band"
)

plt.axvline(x=x0, linestyle="--", linewidth=1.5, label=r"$x_0$")
plt.scatter([x0], [y0_hat], s=80, zorder=5, label=r"$\hat{Y}_0$")

# Pointwise CI at x0
plt.plot([x0, x0], [ci_mean_lower, ci_mean_upper], linewidth=4, label="Pointwise CI at $x_0$")

# Pointwise PI at x0
plt.plot([x0, x0], [pi_lower, pi_upper], linewidth=2, label="Pointwise PI at $x_0$")

plt.xlabel("Average Length of Stay (days)", fontsize=12)
plt.ylabel("Infection Risk (%)", fontsize=12)
plt.title("Hospital Infection Data: Regression, Intervals, and Bands", fontsize=14)
plt.grid(True, linestyle="--", alpha=0.4)
plt.legend()
plt.tight_layout()
plt.show()

EXTRA INTUITION!


📌 What exactly is the Confidence Interval for the Mean Response?¶

We fix a value $x_0$.

Now there are two different random objects we could care about:


1. Mean response (what CI is about)¶

The true mean response at $x_0$ is:

$$ E[Y \mid x_0] = \beta_0 + \beta_1 x_0. $$

👉 This is the average outcome of $Y$ for all units with $x = x_0$.


Interpretation¶

  • If we could repeat the experiment many times at the same $x_0$,
  • then the average of those outcomes would be:
$$ \beta_0 + \beta_1 x_0. $$

Confidence interval answers:¶

“Where is the true regression line at $x_0$?”

So the CI is for:

$$ E[Y \mid x_0] $$

NOT for an individual observation.


2. Prediction (what PI is about)¶

Now consider a new observation:

$$ Y_{\text{new}} = \beta_0 + \beta_1 x_0 + \varepsilon. $$

👉 This includes random noise.


Prediction interval answers:¶

“Where will a new individual value fall?”
