In many problems in data science, we want to understand how a response variable depends on one or more predictor variables.
Examples include predicting a student's exam grade from hours studied, a house price from its size and location, or a patient's blood pressure from age and weight.
Linear regression models the relationship
$$ Y = f(X_1,\dots,X_p) + \varepsilon $$
using a linear function of the predictors.
The general multiple linear regression model is
$$ Y_i = \beta_0 + \beta_1 X_{i1} + \cdots + \beta_p X_{ip} + \varepsilon_i, \quad i=1,\dots,n $$
where $Y_i$ is the response for observation $i$, $X_{i1},\dots,X_{ip}$ are the predictor values, $\beta_0,\dots,\beta_p$ are unknown coefficients, and $\varepsilon_i$ is a random error term.
Assumptions:
$$ \varepsilon_i \sim N(0,\sigma^2), \qquad \text{i.i.d.} $$
Thus
$$ \mathbb{E}[Y_i | X_i] = \beta_0 + \beta_1 X_{i1} + \cdots + \beta_p X_{ip} $$
and
$$ \operatorname{Var}(Y_i | X_i) = \sigma^2 $$
The simplest case has one predictor.
$$ Y_i = \beta_0 + \beta_1 X_i + \varepsilon_i $$
Interpretation: the expected value of $Y$ is
$$ \mathbb{E}[Y|X] = \beta_0 + \beta_1 X $$
We observe data
$$ (X_1,Y_1),\dots,(X_n,Y_n) $$
Define the residuals
$$ e_i = Y_i - (\beta_0 + \beta_1 X_i) $$
The least squares method minimizes
$$ S(\beta_0,\beta_1) = \sum_{i=1}^n e_i^2 $$
that is
$$ S(\beta_0,\beta_1) = \sum_{i=1}^n \left(Y_i-\beta_0-\beta_1 X_i\right)^2 $$
The least squares estimators are
$$ \hat{\beta}_1 = \frac{\sum (X_i-\bar X)(Y_i-\bar Y)}{\sum (X_i-\bar X)^2} $$
and
$$ \hat{\beta}_0 = \bar Y - \hat{\beta}_1 \bar X $$
The fitted regression line is
$$ \hat Y = \hat{\beta}_0 + \hat{\beta}_1 X $$
The slope coefficient measures the expected change in $Y$ when $X$ increases by one unit.
$$ \beta_1 = \frac{\operatorname{Cov}(X,Y)}{\operatorname{Var}(X)} $$
If $\beta_1 > 0$, $Y$ tends to increase as $X$ increases; if $\beta_1 < 0$, $Y$ tends to decrease as $X$ increases.
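As a quick numerical check (simulated data with illustrative true values $\beta_0 = 3$, $\beta_1 = 1.5$), the least squares slope coincides with the sample covariance of $X$ and $Y$ divided by the sample variance of $X$:

```python
import numpy as np

rng = np.random.default_rng(1)

# Simulate from Y = 3 + 1.5 X + eps (parameter values chosen for illustration)
x = rng.normal(size=200)
y = 3.0 + 1.5 * x + rng.normal(size=200)

# Least squares slope from the centered sums
x_bar, y_bar = x.mean(), y.mean()
slope_ls = np.sum((x - x_bar) * (y - y_bar)) / np.sum((x - x_bar) ** 2)

# The same slope as sample covariance over sample variance
slope_cov = np.cov(x, y, ddof=1)[0, 1] / np.var(x, ddof=1)

print(slope_ls, slope_cov)  # the two values agree up to floating-point error
```

With 200 simulated points, both computations also land close to the true slope 1.5.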
The residuals are
$$ e_i = Y_i - \hat Y_i $$
Important properties:
$$ \sum e_i = 0 $$
$$ \sum X_i e_i = 0 $$
These orthogonality conditions are central to least squares.
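These properties are easy to verify numerically; a minimal sketch on a small made-up dataset:

```python
import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y = np.array([2.1, 3.9, 6.2, 7.8, 10.1])  # made-up data

# Fit by the least squares formulas above
x_bar, y_bar = x.mean(), y.mean()
b1 = np.sum((x - x_bar) * (y - y_bar)) / np.sum((x - x_bar) ** 2)
b0 = y_bar - b1 * x_bar
e = y - (b0 + b1 * x)  # residuals

# Both orthogonality conditions hold up to floating-point error
print(np.sum(e))       # approximately 0
print(np.sum(x * e))   # approximately 0
```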
The noise variance is estimated by
$$ \hat{\sigma}^2 = \frac{1}{n-2} \sum e_i^2 $$
The variance of the slope estimator is
$$ \operatorname{Var}(\hat{\beta}_1) = \frac{\sigma^2}{\sum (X_i-\bar X)^2} $$
Estimated standard error:
$$ SE(\hat{\beta}_1) = \sqrt{ \frac{\hat{\sigma}^2}{\sum (X_i-\bar X)^2} } $$
We often test
$$ H_0:\beta_1 = 0 $$
Test statistic:
$$ t = \frac{\hat{\beta}_1}{SE(\hat{\beta}_1)} $$
Under the model assumptions
$$ t \sim t_{n-2} $$
A confidence interval for $\beta_1$ is
$$ \hat{\beta}_1 \pm t_{1-\alpha/2,n-2}\, SE(\hat{\beta}_1) $$
Define the sums of squares
Total variability:
$$ SST = \sum (Y_i-\bar Y)^2 $$
Explained variability:
$$ SSR = \sum (\hat Y_i-\bar Y)^2 $$
Residual variability:
$$ SSE = \sum (Y_i-\hat Y_i)^2 $$
Relationship:
$$ SST = SSR + SSE $$
The coefficient of determination is
$$ R^2 = \frac{SSR}{SST} = 1-\frac{SSE}{SST} $$
Interpretation: $R^2$ is the proportion of the variability in $Y$ explained by the regression model; values close to 1 indicate a good linear fit.
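The inference and goodness-of-fit formulas above fit together in a few lines; here is a sketch on a small made-up dataset:

```python
import numpy as np
from scipy import stats

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
y = np.array([2.2, 2.8, 4.5, 4.1, 5.9, 6.8])  # made-up data
n = len(x)

# Least squares fit
x_bar, y_bar = x.mean(), y.mean()
Sxx = np.sum((x - x_bar) ** 2)
b1 = np.sum((x - x_bar) * (y - y_bar)) / Sxx
b0 = y_bar - b1 * x_bar
y_hat = b0 + b1 * x
e = y - y_hat

# Noise variance estimate and standard error of the slope
sigma2_hat = np.sum(e ** 2) / (n - 2)
se_b1 = np.sqrt(sigma2_hat / Sxx)

# t test of H0: beta1 = 0 and a 95% confidence interval
t_stat = b1 / se_b1
p_value = 2 * stats.t.sf(abs(t_stat), df=n - 2)
t_crit = stats.t.ppf(0.975, df=n - 2)
ci = (b1 - t_crit * se_b1, b1 + t_crit * se_b1)

# Sums of squares and R^2 (SST = SSR + SSE up to rounding)
SST = np.sum((y - y_bar) ** 2)
SSE = np.sum(e ** 2)
SSR = np.sum((y_hat - y_bar) ** 2)
R2 = SSR / SST

print(f"t = {t_stat:.3f}, p = {p_value:.4f}")
print(f"95% CI for beta1: ({ci[0]:.3f}, {ci[1]:.3f})")
print(f"R^2 = {R2:.4f}")
```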
In many data science problems we have multiple predictors.
The model becomes
$$ Y_i = \beta_0 + \beta_1 X_{i1} + \beta_2 X_{i2} + \cdots + \beta_p X_{ip} + \varepsilon_i $$
Define
$$ Y = \begin{pmatrix} Y_1 \\ \vdots \\ Y_n \end{pmatrix}, \qquad X = \begin{pmatrix} 1 & X_{11} & \dots & X_{1p} \\ 1 & X_{21} & \dots & X_{2p} \\ \vdots & \vdots & & \vdots \\ 1 & X_{n1} & \dots & X_{np} \end{pmatrix}, \qquad \beta = \begin{pmatrix} \beta_0 \\ \beta_1 \\ \vdots \\ \beta_p \end{pmatrix} $$
The model is
$$ Y = X\beta + \varepsilon $$
We minimize
$$ S(\beta) = (Y-X\beta)^T (Y-X\beta) $$
Taking derivatives gives the normal equations
$$ X^T X \hat{\beta} = X^T Y $$
If $X^T X$ is invertible:
$$ \hat{\beta} = (X^T X)^{-1} X^T Y $$
Predicted values:
$$ \hat Y = X \hat{\beta} $$
Residuals:
$$ e = Y - \hat Y $$
Projection interpretation:
$$ \hat Y = P_X Y $$
where
$$ P_X = X (X^T X)^{-1} X^T $$
is the projection matrix onto the column space of $X$.
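A minimal NumPy sketch of these matrix formulas on simulated data (the true coefficients are chosen for illustration; in practice `np.linalg.lstsq` is preferable to forming the inverse explicitly):

```python
import numpy as np

rng = np.random.default_rng(2)
n, p = 50, 2

# Design matrix with an intercept column of ones
X = np.column_stack([np.ones(n), rng.normal(size=(n, p))])
beta_true = np.array([1.0, 2.0, -0.5])  # illustrative values
Y = X @ beta_true + rng.normal(scale=0.3, size=n)

# Solve the normal equations X^T X beta = X^T Y
beta_hat = np.linalg.solve(X.T @ X, X.T @ Y)

# Projection matrix onto the column space of X
P = X @ np.linalg.inv(X.T @ X) @ X.T
Y_hat = P @ Y

print(beta_hat)                           # close to beta_true
print(np.allclose(P, P.T))                # P is symmetric
print(np.allclose(P @ P, P))              # P is idempotent
print(np.allclose(Y_hat, X @ beta_hat))   # P Y equals X beta_hat
```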
Under the model assumptions
$$ \operatorname{Var}(\hat{\beta}) = \sigma^2 (X^T X)^{-1} $$
Estimate:
$$ \hat{\sigma}^2 = \frac{SSE}{n-p-1} $$
For each coefficient we test
$$ H_0:\beta_j = 0 $$
Test statistic:
$$ t = \frac{\hat{\beta}_j}{SE(\hat{\beta}_j)} $$
with
$$ t \sim t_{n-p-1} $$
We test the overall hypothesis
$$ H_0: \beta_1 = \cdots = \beta_p = 0 $$
Statistic:
$$ F = \frac{SSR/p}{SSE/(n-p-1)} $$
Under $H_0$
$$ F \sim F_{p,n-p-1} $$
The classical linear regression model assumes:
- Linearity: the mean of $Y$ is a linear function of the predictors.
- Independence: the errors $\varepsilon_i$ are independent.
- Homoscedasticity: the errors have constant variance $\sigma^2$.
- Normality: the errors are normally distributed.
In practice, regression is used for:
- Prediction of new responses from observed predictor values.
- Inference about the strength and direction of relationships.
- Adjusting for additional variables when comparing groups.
Extensions include:
- Polynomial and interaction terms.
- Regularized regression (ridge, lasso).
- Generalized linear models such as logistic regression.
Linear regression provides:
- A simple, interpretable model of the predictor-response relationship.
- Closed-form estimators with well-understood statistical properties.
It is one of the fundamental tools in statistics and data science.
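Before the single-predictor examples that follow, the multiple-regression tests described above can be sketched on simulated data (all parameter values below are illustrative):

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(3)
n, p = 100, 2

X = np.column_stack([np.ones(n), rng.normal(size=(n, p))])
beta_true = np.array([1.0, 0.8, 0.0])  # the last coefficient is truly zero
Y = X @ beta_true + rng.normal(size=n)

# Least squares fit
XtX_inv = np.linalg.inv(X.T @ X)
beta_hat = XtX_inv @ X.T @ Y
e = Y - X @ beta_hat

# Per-coefficient t statistics from Var(beta_hat) = sigma^2 (X^T X)^{-1}
sigma2_hat = e @ e / (n - p - 1)
se = np.sqrt(sigma2_hat * np.diag(XtX_inv))
t_stats = beta_hat / se
t_pvalues = 2 * stats.t.sf(np.abs(t_stats), df=n - p - 1)

# Overall F test of H0: beta_1 = ... = beta_p = 0
SSR = np.sum((X @ beta_hat - Y.mean()) ** 2)
SSE = e @ e
F = (SSR / p) / (SSE / (n - p - 1))
F_pvalue = stats.f.sf(F, p, n - p - 1)

for j in range(p + 1):
    print(f"beta_{j}: estimate={beta_hat[j]:.3f}, t={t_stats[j]:.2f}, p={t_pvalues[j]:.4f}")
print(f"F = {F:.2f}, p = {F_pvalue:.6f}")
```

The coefficient that is truly nonzero should come out clearly significant, while the truly zero one typically does not.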
import numpy as np
import matplotlib.pyplot as plt

def simple_linear_regression(x, y, show_plot=True):
    """
    Simple linear regression for one predictor.

    Parameters
    ----------
    x : array-like
        Predictor values.
    y : array-like
        Response values.
    show_plot : bool, default=True
        If True, draws the scatter plot and the fitted regression line.

    Returns
    -------
    results : dict
        Dictionary containing:
        - intercept
        - slope
        - y_hat
        - residuals
        - SSE
        - SST
        - SSR
        - R2
    """
    x = np.array(x, dtype=float)
    y = np.array(y, dtype=float)
    if len(x) != len(y):
        raise ValueError("x and y must have the same length.")
    n = len(x)
    if n < 2:
        raise ValueError("At least two data points are required.")
    x_bar = np.mean(x)
    y_bar = np.mean(y)
    Sxx = np.sum((x - x_bar) ** 2)
    Sxy = np.sum((x - x_bar) * (y - y_bar))
    if Sxx == 0:
        raise ValueError("All x values are equal, so the slope is undefined.")
    # Least squares estimators
    slope = Sxy / Sxx
    intercept = y_bar - slope * x_bar
    # Fitted values and residuals
    y_hat = intercept + slope * x
    residuals = y - y_hat
    # Sums of squares
    SSE = np.sum((y - y_hat) ** 2)
    SST = np.sum((y - y_bar) ** 2)
    SSR = np.sum((y_hat - y_bar) ** 2)
    R2 = SSR / SST if SST != 0 else 1.0
    if show_plot:
        plt.figure(figsize=(8, 5))
        plt.scatter(x, y, s=60, label="Observed data")
        x_line = np.linspace(np.min(x), np.max(x), 200)
        y_line = intercept + slope * x_line
        plt.plot(x_line, y_line, linewidth=2, label="Least squares line")
        plt.xlabel("x")
        plt.ylabel("y")
        plt.title("Scatter plot with fitted regression line")
        plt.grid(True, alpha=0.3)
        plt.legend()
        plt.show()
    print("Simple Linear Regression Results")
    print("--------------------------------")
    print(f"Intercept (b0) = {intercept:.6f}")
    print(f"Slope (b1) = {slope:.6f}")
    print(f"Fitted line = y_hat = {intercept:.6f} + {slope:.6f}x")
    print(f"SSE = {SSE:.6f}")
    print(f"SSR = {SSR:.6f}")
    print(f"SST = {SST:.6f}")
    print(f"R^2 = {R2:.6f}")
    return {
        "intercept": intercept,
        "slope": slope,
        "y_hat": y_hat,
        "residuals": residuals,
        "SSE": SSE,
        "SSR": SSR,
        "SST": SST,
        "R2": R2
    }

# Input data
hours_studied = [20, 16, 20, 18, 17, 16, 15, 17, 15, 16, 15, 17, 16, 17, 14]
grade_on_exam = [89, 72, 93, 84, 81, 75, 70, 82, 69, 83, 80, 83, 81, 84, 76]

# Run simple linear regression
results = simple_linear_regression(hours_studied, grade_on_exam, show_plot=True)
Simple Linear Regression Results
--------------------------------
Intercept (b0) = 26.741987
Slope (b1) = 3.216346
Fitted line = y_hat = 26.741987 + 3.216346x
SSE = 201.386218
SSR = 430.347115
SST = 631.733333
R^2 = 0.681216
import numpy as np
from scipy import stats

def regression_f_test(x, y, alpha=0.05):
    """
    F-test for significance of simple linear regression.

    H0: beta1 = 0 (no relationship)
    H1: beta1 != 0

    Returns
    -------
    dictionary with regression and test results
    """
    x = np.array(x, dtype=float)
    y = np.array(y, dtype=float)
    n = len(x)
    x_bar = np.mean(x)
    y_bar = np.mean(y)
    Sxx = np.sum((x - x_bar)**2)
    Sxy = np.sum((x - x_bar)*(y - y_bar))
    # regression coefficients
    b1 = Sxy / Sxx
    b0 = y_bar - b1*x_bar
    # fitted values
    y_hat = b0 + b1*x
    # sums of squares
    SSE = np.sum((y - y_hat)**2)
    SST = np.sum((y - y_bar)**2)
    SSR = SST - SSE
    # degrees of freedom
    df_reg = 1
    df_err = n - 2
    # mean squares
    MSR = SSR / df_reg
    MSE = SSE / df_err
    # F statistic
    F = MSR / MSE
    # p-value
    p_value = 1 - stats.f.cdf(F, df_reg, df_err)
    # critical value
    F_crit = stats.f.ppf(1 - alpha, df_reg, df_err)
    print("F-Test for Significance of Regression")
    print("------------------------------------")
    print(f"n = {n}")
    print(f"Intercept (b0) = {b0:.6f}")
    print(f"Slope (b1) = {b1:.6f}")
    print()
    print("ANOVA Table")
    print("------------------------------------")
    print(f"SSR (Regression) = {SSR:.4f}")
    print(f"SSE (Error) = {SSE:.4f}")
    print(f"SST (Total) = {SST:.4f}")
    print()
    print(f"MSR = {MSR:.4f}")
    print(f"MSE = {MSE:.4f}")
    print()
    print(f"F statistic = {F:.4f}")
    print(f"F critical = {F_crit:.4f}")
    print(f"p-value = {p_value:.6f}")
    print()
    if F > F_crit:
        print("Reject H0: There is a significant linear relationship.")
    else:
        print("Fail to reject H0: No significant relationship detected.")
    return {
        "F": F,
        "p_value": p_value,
        "b0": b0,
        "b1": b1,
        "SSE": SSE,
        "SSR": SSR,
        "SST": SST
    }

# Data
hours_studied = [20,16,20,18,17,16,15,17,15,16,15,17,16,17,14]
grade_on_exam = [89,72,93,84,81,75,70,82,69,83,80,83,81,84,76]

# Run the F-test
result = regression_f_test(hours_studied, grade_on_exam, alpha=0.05)
F-Test for Significance of Regression
------------------------------------
n = 15
Intercept (b0) = 26.741987
Slope (b1) = 3.216346

ANOVA Table
------------------------------------
SSR (Regression) = 430.3471
SSE (Error) = 201.3862
SST (Total) = 631.7333

MSR = 430.3471
MSE = 15.4912

F statistic = 27.7800
F critical = 4.6672
p-value = 0.000151

Reject H0: There is a significant linear relationship.
import numpy as np

def coefficient_of_determination(x, y):
    """
    Computes the coefficient of determination (R^2)
    for a simple linear regression model.
    """
    x = np.array(x, dtype=float)
    y = np.array(y, dtype=float)
    n = len(x)
    x_bar = np.mean(x)
    y_bar = np.mean(y)
    # Compute regression coefficients
    Sxx = np.sum((x - x_bar)**2)
    Sxy = np.sum((x - x_bar)*(y - y_bar))
    b1 = Sxy / Sxx
    b0 = y_bar - b1*x_bar
    # Predicted values
    y_hat = b0 + b1*x
    # Sums of squares
    SSE = np.sum((y - y_hat)**2)
    SST = np.sum((y - y_bar)**2)
    SSR = SST - SSE
    # R^2
    R2 = SSR / SST
    print("Coefficient of Determination")
    print("-----------------------------")
    print(f"SST = {SST:.4f}")
    print(f"SSR = {SSR:.4f}")
    print(f"SSE = {SSE:.4f}")
    print()
    print(f"R^2 = {R2:.6f}")
    print()
    print(f"Interpretation: {R2*100:.2f}% of the variability in y")
    print("is explained by the linear regression model.")
    return R2

hours_studied = [20,16,20,18,17,16,15,17,15,16,15,17,16,17,14]
grade_on_exam = [89,72,93,84,81,75,70,82,69,83,80,83,81,84,76]

R2 = coefficient_of_determination(hours_studied, grade_on_exam)
Coefficient of Determination
-----------------------------
SST = 631.7333
SSR = 430.3471
SSE = 201.3862

R^2 = 0.681216

Interpretation: 68.12% of the variability in y
is explained by the linear regression model.