Seminar 6

Two-Way ANOVA

Two-Way ANOVA (Factorial ANOVA): Complete Theory, Derivations, and Formulas¶

Two-way ANOVA is the natural extension of one-way ANOVA to experiments with two categorical factors (also called treatments, explanatory variables, or fixed effects). It answers:

  1. Does factor A affect the mean response?
  2. Does factor B affect the mean response?
  3. Is there an interaction between A and B (i.e., does the effect of A depend on the level of B)?

1. Data Structure and Notation¶

Let:

  • Factor A have levels $$i = 1,\dots,a$$
  • Factor B have levels $$j = 1,\dots,b$$
  • For each cell (combination) $$(i,j)$$ we observe $$n_{ij}$$ replicates: $$Y_{ijk},\quad k=1,\dots,n_{ij}.$$

Means¶

Define:

  • Cell mean: $$\bar{Y}_{ij\cdot}=\frac{1}{n_{ij}}\sum_{k=1}^{n_{ij}}Y_{ijk}.$$

  • Marginal mean for factor A: $$\bar{Y}_{i\cdot\cdot}=\frac{1}{n_{i\cdot}}\sum_{j=1}^{b}\sum_{k=1}^{n_{ij}}Y_{ijk}, \quad n_{i\cdot}=\sum_{j=1}^{b}n_{ij}.$$

  • Marginal mean for factor B: $$\bar{Y}_{\cdot j\cdot}=\frac{1}{n_{\cdot j}}\sum_{i=1}^{a}\sum_{k=1}^{n_{ij}}Y_{ijk}, \quad n_{\cdot j}=\sum_{i=1}^{a}n_{ij}.$$

  • Grand mean: $$\bar{Y}_{\cdot\cdot\cdot}=\frac{1}{N}\sum_{i=1}^{a}\sum_{j=1}^{b}\sum_{k=1}^{n_{ij}}Y_{ijk}, \quad N=\sum_{i=1}^{a}\sum_{j=1}^{b}n_{ij}.$$
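In code, all of these means are axis-wise averages. A minimal numpy sketch, using a hypothetical balanced 2×2 layout with $$n=2$$ replicates stored as `y[i, j, k]`:

```python
import numpy as np

# Hypothetical balanced layout: a = 2, b = 2, n = 2, indexed y[i, j, k]
y = np.array([[[10., 12.], [20., 22.]],
              [[20., 22.], [10., 12.]]])

cell_means = y.mean(axis=2)        # Ybar_{ij.}
row_means  = y.mean(axis=(1, 2))   # Ybar_{i..}, marginal means for factor A
col_means  = y.mean(axis=(0, 2))   # Ybar_{.j.}, marginal means for factor B
grand_mean = y.mean()              # Ybar_{...}

print(cell_means)    # cell means 11, 21 / 21, 11
print(grand_mean)    # 16.0
```

In a balanced design the marginal means are also the averages of the corresponding cell means, which the arrays above confirm.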


2. The Two-Way ANOVA Model (Fixed Effects)¶

2.1 Model with Interaction¶

The standard fixed-effects model is:

$$ Y_{ijk}=\mu+\alpha_i+\beta_j+(\alpha\beta)_{ij}+\varepsilon_{ijk}, $$

where:

  • $$\mu$$ is an overall mean,
  • $$\alpha_i$$ is the effect of level $$i$$ of factor A,
  • $$\beta_j$$ is the effect of level $$j$$ of factor B,
  • $$(\alpha\beta)_{ij}$$ is the interaction effect for cell $$ (i,j) $$,
  • errors $$\varepsilon_{ijk} \stackrel{iid}{\sim} N(0,\sigma^2).$$

2.2 Identifiability Constraints (Why we need them)¶

The parameters are not unique without constraints: for example, adding a constant $$c$$ to every $$\alpha_i$$ and subtracting $$c$$ from $$\mu$$ leaves all cell means unchanged.

A common choice:

$$ \sum_{i=1}^a \alpha_i = 0,\qquad \sum_{j=1}^b \beta_j = 0,\qquad \sum_{i=1}^a (\alpha\beta)_{ij}=0\ \forall j,\qquad \sum_{j=1}^b (\alpha\beta)_{ij}=0\ \forall i. $$

These enforce that effects are deviations from overall/marginal means.
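Under these constraints, the balanced-case estimates follow directly from the cell means. A small sketch with hypothetical cell means, showing that the sum-to-zero constraints hold by construction:

```python
import numpy as np

# Hypothetical cell means Ybar_{ij.} for a balanced 2x2 design
cell_means = np.array([[11., 21.],
                       [21., 11.]])

grand = cell_means.mean()                          # estimate of mu
alpha = cell_means.mean(axis=1) - grand            # row effects alpha_i
beta  = cell_means.mean(axis=0) - grand            # column effects beta_j
inter = cell_means - grand - alpha[:, None] - beta[None, :]  # (alpha beta)_{ij}

# The sum-to-zero constraints hold automatically:
assert abs(alpha.sum()) < 1e-12 and abs(beta.sum()) < 1e-12
assert np.allclose(inter.sum(axis=0), 0) and np.allclose(inter.sum(axis=1), 0)
```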

2.3 Mean Structure¶

Taking expectation:

$$ \mathbb{E}[Y_{ijk}] = \mu+\alpha_i+\beta_j+(\alpha\beta)_{ij}. $$

This is the cell mean in the model:

$$ \mu_{ij}:=\mathbb{E}[Y_{ijk}] = \mu+\alpha_i+\beta_j+(\alpha\beta)_{ij}. $$

3. Hypotheses Tested¶

3.1 Interaction First (Important logic)¶

The interaction tests whether the effect of A depends on B.

$$ H_0^{(AB)}:\ (\alpha\beta)_{ij}=0\ \ \text{for all }i,j $$

vs

$$ H_1^{(AB)}:\ \text{at least one }(\alpha\beta)_{ij}\neq 0. $$

If interaction is significant, interpretation focuses on simple effects (A at each B, or B at each A), not just main effects.

3.2 Main Effects (when interaction is absent or not of interest)¶

Factor A main effect:

$$ H_0^{(A)}:\ \alpha_1=\cdots=\alpha_a=0 $$

Factor B main effect:

$$ H_0^{(B)}:\ \beta_1=\cdots=\beta_b=0 $$

4. Balanced Two-Way ANOVA (Cleanest Case)¶

Balanced means:

$$ n_{ij}=n\quad\text{for all }i,j. $$

Then each cell has the same number of replicates, and formulas simplify dramatically.

Let:

  • $$N=abn.$$

5. Sums of Squares: Where do they come from?¶

ANOVA is based on decomposing total variability into orthogonal components.

5.1 Total Sum of Squares¶

Define:

$$ SS_T=\sum_{i=1}^a\sum_{j=1}^b\sum_{k=1}^n\left(Y_{ijk}-\bar{Y}_{\cdot\cdot\cdot}\right)^2. $$

This measures overall variation around the grand mean.

5.2 Within-Cell (Error) Sum of Squares¶

Error variation is variation inside each cell:

$$ SS_E=\sum_{i=1}^a\sum_{j=1}^b\sum_{k=1}^n\left(Y_{ijk}-\bar{Y}_{ij\cdot}\right)^2. $$

Why this corresponds to error: Under the model, $$Y_{ijk}-\mathbb{E}[Y_{ijk}]=\varepsilon_{ijk}$$. The cell mean estimates the cell expectation, so deviations from the cell mean estimate pure noise.
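A quick Monte Carlo illustration of this point (hypothetical sizes; under a pure-noise model, $$MS_E = SS_E/\big(ab(n-1)\big)$$ averages to $$\sigma^2$$):

```python
import numpy as np

rng = np.random.default_rng(6)
a, b, n, sigma = 2, 3, 4, 1.5

ms_e = []
for _ in range(2000):
    y = rng.normal(scale=sigma, size=(a, b, n))    # noise only, all means zero
    cell = y.mean(axis=2, keepdims=True)
    ms_e.append(((y - cell) ** 2).sum() / (a * b * (n - 1)))

print(np.mean(ms_e))      # close to sigma^2 = 2.25
```

The same holds whatever the cell means are, since subtracting the cell mean removes them.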


6. Deriving the Decomposition (Key Mathematical Reason)¶

For any observation, add and subtract the cell mean and grand mean:

$$ Y_{ijk}-\bar{Y}_{\cdot\cdot\cdot} = \left(Y_{ijk}-\bar{Y}_{ij\cdot}\right) + \left(\bar{Y}_{ij\cdot}-\bar{Y}_{\cdot\cdot\cdot}\right). $$

Square and sum:

$$ \sum (Y_{ijk}-\bar{Y}_{\cdot\cdot\cdot})^2 = \sum (Y_{ijk}-\bar{Y}_{ij\cdot})^2 + \sum n(\bar{Y}_{ij\cdot}-\bar{Y}_{\cdot\cdot\cdot})^2 + 2\sum (Y_{ijk}-\bar{Y}_{ij\cdot})(\bar{Y}_{ij\cdot}-\bar{Y}_{\cdot\cdot\cdot}). $$

The cross-term vanishes because for each fixed $$(i,j)$$,

$$ \sum_{k=1}^n (Y_{ijk}-\bar{Y}_{ij\cdot})=0. $$

Therefore:

$$ SS_T=SS_E+SS_{\text{Cells}}, $$

where

$$ SS_{\text{Cells}} = n\sum_{i=1}^a\sum_{j=1}^b(\bar{Y}_{ij\cdot}-\bar{Y}_{\cdot\cdot\cdot})^2. $$
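This identity can be verified numerically on arbitrary balanced data, a sketch with simulated values:

```python
import numpy as np

rng = np.random.default_rng(0)
a, b, n = 3, 4, 5
y = rng.normal(size=(a, b, n))          # arbitrary balanced data

grand = y.mean()
cell = y.mean(axis=2)

SS_T = ((y - grand) ** 2).sum()
SS_E = ((y - cell[:, :, None]) ** 2).sum()
SS_cells = n * ((cell - grand) ** 2).sum()

assert np.isclose(SS_T, SS_E + SS_cells)  # the cross-term vanished
```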

Now we further decompose $$SS_{\text{Cells}}$$ into A, B, and interaction.


7. Main Effects and Interaction Sums of Squares (Balanced Case)¶

7.1 Factor A Sum of Squares¶

Define the A marginal means $$\bar{Y}_{i\cdot\cdot}$$. Then:

$$ SS_A=bn\sum_{i=1}^a(\bar{Y}_{i\cdot\cdot}-\bar{Y}_{\cdot\cdot\cdot})^2. $$

Why the coefficient $$bn$$? Each $$\bar{Y}_{i\cdot\cdot}$$ averages over $$b$$ cells and $$n$$ replicates per cell, so it represents $$bn$$ observations.

7.2 Factor B Sum of Squares¶

Similarly:

$$ SS_B=an\sum_{j=1}^b(\bar{Y}_{\cdot j\cdot}-\bar{Y}_{\cdot\cdot\cdot})^2. $$

7.3 Interaction Sum of Squares¶

Interaction measures deviations of cell means from what would be predicted by adding main effects:

Define the “additive prediction” for cell mean:

$$ \widehat{\mu}^{\text{add}}_{ij} = \bar{Y}_{i\cdot\cdot}+\bar{Y}_{\cdot j\cdot}-\bar{Y}_{\cdot\cdot\cdot}. $$

Then interaction SS is:

$$ SS_{AB} = n\sum_{i=1}^a\sum_{j=1}^b \left(\bar{Y}_{ij\cdot}-\bar{Y}_{i\cdot\cdot}-\bar{Y}_{\cdot j\cdot}+\bar{Y}_{\cdot\cdot\cdot}\right)^2. $$

7.4 Full Decomposition¶

Balanced two-way ANOVA gives:

$$ SS_T = SS_A + SS_B + SS_{AB} + SS_E. $$

This is the core ANOVA identity.
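The full identity can be checked numerically on simulated balanced data (any seed and any design sizes work):

```python
import numpy as np

# Numerical check of SS_T = SS_A + SS_B + SS_AB + SS_E on balanced data
rng = np.random.default_rng(1)
a, b, n = 3, 4, 5
y = rng.normal(size=(a, b, n))

grand = y.mean()
cell = y.mean(axis=2)
row = y.mean(axis=(1, 2))
col = y.mean(axis=(0, 2))

SS_A  = b * n * ((row - grand) ** 2).sum()
SS_B  = a * n * ((col - grand) ** 2).sum()
SS_AB = n * ((cell - row[:, None] - col[None, :] + grand) ** 2).sum()
SS_E  = ((y - cell[:, :, None]) ** 2).sum()
SS_T  = ((y - grand) ** 2).sum()

assert np.isclose(SS_T, SS_A + SS_B + SS_AB + SS_E)
```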


8. Degrees of Freedom (Why these numbers?)¶

Let $$N=abn$$.

  • Total df: $$df_T = N-1 = abn-1.$$

  • Error df: within each of $$ab$$ cells you lose 1 df estimating the cell mean: $$df_E = ab(n-1).$$

  • A df: $$df_A = a-1.$$

  • B df: $$df_B = b-1.$$

  • Interaction df: $$df_{AB}=(a-1)(b-1).$$

Check:

$$ df_A+df_B+df_{AB}+df_E = (a-1)+(b-1)+(a-1)(b-1)+ab(n-1) = abn-1 = df_T. $$
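The identity holds for every $$a, b, n$$, which a short loop confirms:

```python
# Degrees-of-freedom identity, checked over a grid of design sizes
for a in range(2, 6):
    for b in range(2, 6):
        for n in range(2, 6):
            lhs = (a - 1) + (b - 1) + (a - 1) * (b - 1) + a * b * (n - 1)
            assert lhs == a * b * n - 1
```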

9. Mean Squares and Why F-tests Work¶

Define mean squares:

$$ MS_A=\frac{SS_A}{df_A},\quad MS_B=\frac{SS_B}{df_B},\quad MS_{AB}=\frac{SS_{AB}}{df_{AB}},\quad MS_E=\frac{SS_E}{df_E}. $$

9.1 Key distributional reason: Quadratic forms in normals¶

Because $$\varepsilon_{ijk}$$ are i.i.d. normal, sums of squares like $$SS_E/\sigma^2$$ become chi-square random variables. In the balanced fixed-effects model:

  • Under the null hypothesis for a given effect (A, B, or AB), the corresponding mean square has expectation $$\sigma^2$$; $$MS_E$$ has expectation $$\sigma^2$$ whether or not that null holds.

For example, under $$H_0^{(A)}$$:

$$ \mathbb{E}[MS_A]=\sigma^2,\qquad \mathbb{E}[MS_E]=\sigma^2. $$

Thus the ratio

$$ F_A=\frac{MS_A}{MS_E} $$

has an F distribution under the null:

$$ F_A \sim F_{df_A,df_E}\quad\text{under }H_0^{(A)}. $$

Similarly:

$$ F_B=\frac{MS_B}{MS_E}\sim F_{df_B,df_E},\qquad F_{AB}=\frac{MS_{AB}}{MS_E}\sim F_{df_{AB},df_E}. $$
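In practice each F statistic is converted to a right-tail p-value or compared against a critical value. A sketch with scipy, using hypothetical values $$F_A = 5.2$$, $$df_A = 2$$, $$df_E = 24$$:

```python
from scipy.stats import f

# Hypothetical test: F_A = 5.2 on (df_A, df_E) = (2, 24)
F_A, df_A, df_E = 5.2, 2, 24

p_A = f.sf(F_A, df_A, df_E)        # right-tail p-value, P(F >= F_A)
F_crit = f.ppf(0.95, df_A, df_E)   # 5% critical value

print(round(p_A, 4), round(F_crit, 2))   # reject at 5% since F_A > F_crit
```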

10. The Two-Way ANOVA Table (Balanced Case)¶

Source SS df MS F
Factor A $$SS_A$$ $$a-1$$ $$MS_A$$ $$F_A=MS_A/MS_E$$
Factor B $$SS_B$$ $$b-1$$ $$MS_B$$ $$F_B=MS_B/MS_E$$
Interaction AB $$SS_{AB}$$ $$(a-1)(b-1)$$ $$MS_{AB}$$ $$F_{AB}=MS_{AB}/MS_E$$
Error $$SS_E$$ $$ab(n-1)$$ $$MS_E$$ —
Total $$SS_T$$ $$abn-1$$ — —

11. Interpretation Logic (Correct order)¶

  1. Test interaction using $$F_{AB}$$.
  2. If interaction is significant:
    • main effects alone can be misleading
    • analyze simple effects and interaction plots
  3. If interaction is not significant (and design supports it):
    • interpret main effects via $$F_A, F_B$$
    • proceed to post hoc comparisons if needed.

12. Two-Way ANOVA WITHOUT Replication (One observation per cell)¶

Sometimes $$n=1$$ (one measurement in each cell). Then:

  • You cannot estimate within-cell error (no replication), so interaction cannot be separated from error.

In that case, the model often used is the additive model (no interaction):

$$ Y_{ij} = \mu + \alpha_i + \beta_j + \varepsilon_{ij}, $$

and error is the leftover variability after removing row/column effects:

$$ SS_E = SS_T - SS_A - SS_B. $$

Degrees of freedom become:

  • $$df_T=ab-1$$
  • $$df_A=a-1$$
  • $$df_B=b-1$$
  • $$df_E=(a-1)(b-1)$$

But note: here $$df_E$$ is mathematically identical to the interaction df, meaning interaction is confounded with error if it exists.
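A sketch of the $$n=1$$ additive analysis on hypothetical data. $$SS_E$$ is obtained by subtraction, and equivalently as the sum of squared residuals after removing row and column effects:

```python
import numpy as np
from scipy.stats import f

# Hypothetical one-observation-per-cell layout Y[i, j]: a = 3 rows, b = 4 columns
Y = np.array([[10., 12., 11., 13.],
              [14., 15., 13., 16.],
              [ 9., 11., 10., 12.]])
a, b = Y.shape
grand = Y.mean()
row = Y.mean(axis=1)
col = Y.mean(axis=0)

SS_T = ((Y - grand) ** 2).sum()
SS_A = b * ((row - grand) ** 2).sum()
SS_B = a * ((col - grand) ** 2).sum()
SS_E = SS_T - SS_A - SS_B          # residual: interaction confounded with error

df_A, df_B, df_E = a - 1, b - 1, (a - 1) * (b - 1)
F_A = (SS_A / df_A) / (SS_E / df_E)
F_B = (SS_B / df_B) / (SS_E / df_E)
print(F_A, f.sf(F_A, df_A, df_E))
print(F_B, f.sf(F_B, df_B, df_E))
```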


13. Unbalanced Designs (Unequal $$n_{ij}$$): What changes?¶

When $$n_{ij}$$ are not equal:

  • The clean orthogonal decomposition can fail.
  • There are different “types” of sums of squares (Type I, II, III) depending on ordering and definitions.
  • Tukey–Kramer handles unequal $$n_i$$ for pairwise comparisons after one-way ANOVA; for two-way, post hoc is more subtle.

Core concept: with unbalanced designs, main effects are not necessarily orthogonal to interaction and each other, so SS depends on how you “adjust” for other terms.


14. Connection to Linear Models (Why ANOVA is regression)¶

Two-way ANOVA is a special case of the linear model:

$$ \mathbf{Y} = \mathbf{X}\boldsymbol{\beta} + \boldsymbol{\varepsilon}, \qquad \boldsymbol{\varepsilon}\sim N(0,\sigma^2 I). $$
  • Columns of $$\mathbf{X}$$ are indicator variables for factor levels and interactions.
  • Sums of squares correspond to squared lengths of projections of $$\mathbf{Y}$$ onto subspaces (geometry).
  • F-tests compare reductions in residual sum of squares when adding terms.

This is the deep mathematical reason ANOVA works: it’s orthogonal projection + Gaussian quadratic forms.
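A sketch making the regression view concrete: build design matrices with and without interaction columns, and compute the interaction F test as the drop in residual sum of squares. In a balanced design this reproduces the classical $$F_{AB}=MS_{AB}/MS_E$$ exactly (the data and coding below are illustrative):

```python
import numpy as np

rng = np.random.default_rng(2)
a, b, n = 2, 3, 4
A = np.repeat(np.arange(a), b * n)             # factor A codes, cell-ordered
B = np.tile(np.repeat(np.arange(b), n), a)     # factor B codes
y = rng.normal(size=a * b * n)

def dummies(codes, levels):
    return (codes[:, None] == np.arange(levels)[None, :]).astype(float)

# Reference-coded design matrices: additive model vs model with interaction
X_add = np.column_stack([np.ones_like(y), dummies(A, a)[:, 1:], dummies(B, b)[:, 1:]])
X_int = (dummies(A, a)[:, 1:, None] * dummies(B, b)[:, None, 1:]).reshape(len(y), -1)
X_full = np.column_stack([X_add, X_int])

def rss(X, y):
    beta, *_ = np.linalg.lstsq(X, y, rcond=None)
    r = y - X @ beta
    return r @ r

df_AB = (a - 1) * (b - 1)
df_E = a * b * (n - 1)
F_AB = ((rss(X_add, y) - rss(X_full, y)) / df_AB) / (rss(X_full, y) / df_E)
print(F_AB)
```

The residual sum of squares of the full model equals $$SS_E$$, and the extra sum of squares for the interaction columns equals $$SS_{AB}$$; this equality is what "balanced" buys you.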


15. Assumptions and Diagnostics¶

Two-way ANOVA validity relies on:

  1. Independence of errors
  2. Normality of errors (or large-sample robustness)
  3. Homoscedasticity: equal variances across cells
  4. Correct model specification (interaction vs additive)

Diagnostics commonly used:

  • Residual plots vs fitted values
  • QQ plot of residuals
  • Levene / Brown–Forsythe for equal variances
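For the variance check, scipy's `levene` covers both variants: `center='median'` gives the Brown–Forsythe version. A sketch on hypothetical cell samples with equal variances:

```python
import numpy as np
from scipy.stats import levene

# Hypothetical replicates from four cells, one array per cell
rng = np.random.default_rng(3)
cells = [rng.normal(loc=m, scale=1.0, size=6) for m in (0., 1., 2., 3.)]

stat, p = levene(*cells, center='median')   # Brown-Forsythe: median-centered
print(stat, p)   # test statistic and p-value for equal cell variances
```

A small p-value would indicate heteroscedasticity across cells, violating assumption 3.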

16. Summary (Exam-ready)¶

  • Model with interaction: $$Y_{ijk}=\mu+\alpha_i+\beta_j+(\alpha\beta)_{ij}+\varepsilon_{ijk}.$$

  • Balanced SS decomposition: $$SS_T = SS_A + SS_B + SS_{AB} + SS_E.$$

  • Mean squares: $$MS = SS/df.$$

  • F tests: $$F_A=\frac{MS_A}{MS_E},\quad F_B=\frac{MS_B}{MS_E},\quad F_{AB}=\frac{MS_{AB}}{MS_E}.$$

  • Always test interaction first; if significant, interpret simple effects.



Linear Algebra and Geometric Foundations of Two-Way ANOVA¶

This section explains why ANOVA sums of squares decompose as they do, and why F-tests follow F-distributions.
Everything is a consequence of orthogonal projections in Euclidean space and quadratic forms of Gaussian vectors.


1. ANOVA as a Linear Model¶

Two-way ANOVA is a special case of the linear model

$$ \mathbf{Y} = \mathbf{X}\boldsymbol{\beta} + \boldsymbol{\varepsilon}, \qquad \boldsymbol{\varepsilon} \sim N(\mathbf{0}, \sigma^2 \mathbf{I}_N), $$

where:

  • $$\mathbf{Y} \in \mathbb{R}^N$$ is the vector of observations
  • $$\mathbf{X}$$ is the design matrix
  • $$\boldsymbol{\beta}$$ contains all parameters
  • $$\boldsymbol{\varepsilon}$$ is Gaussian noise

2. Design Matrix Structure (Two-Way ANOVA)¶

For two-way ANOVA with interaction, the columns of $$\mathbf{X}$$ span:

  1. Intercept space (overall mean)
  2. Factor A space
  3. Factor B space
  4. Interaction space

Formally, the column space decomposes as

$$ \mathcal{C}(\mathbf{X}) = \mathcal{S}_\mu \;\oplus\; \mathcal{S}_A \;\oplus\; \mathcal{S}_B \;\oplus\; \mathcal{S}_{AB}, $$

where each subspace corresponds to one effect.

Balanced designs guarantee orthogonality of these subspaces.


3. Orthogonal Projection Interpretation¶

Let:

  • $$\mathbf{P}_\mu$$ = projection onto the intercept space
  • $$\mathbf{P}_A$$ = projection onto factor A space
  • $$\mathbf{P}_B$$ = projection onto factor B space
  • $$\mathbf{P}_{AB}$$ = projection onto interaction space
  • $$\mathbf{P}_E$$ = projection onto error space

Then:

$$ \mathbf{I} = \mathbf{P}_\mu + \mathbf{P}_A + \mathbf{P}_B + \mathbf{P}_{AB} + \mathbf{P}_E $$

with mutually orthogonal projections.


4. Sums of Squares as Squared Projection Lengths¶

Each sum of squares is a squared Euclidean norm of a projection:

  • Total SS: $$ SS_T = \|\mathbf{Y} - \bar{Y}\mathbf{1}\|^2 $$

  • Factor A: $$ SS_A = \|\mathbf{P}_A \mathbf{Y}\|^2 $$

  • Factor B: $$ SS_B = \|\mathbf{P}_B \mathbf{Y}\|^2 $$

  • Interaction: $$ SS_{AB} = \|\mathbf{P}_{AB} \mathbf{Y}\|^2 $$

  • Error: $$ SS_E = \|\mathbf{P}_E \mathbf{Y}\|^2 $$

Because the projections are orthogonal,

$$ SS_T = SS_A + SS_B + SS_{AB} + SS_E. $$

This identity is pure Pythagoras in $\mathbb{R}^N$.
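These projections can be built explicitly from indicator matrices. A sketch for a balanced 2×2 design with $$n=2$$: each effect projection is a difference of averaging projections, exactly the balanced-case construction described above.

```python
import numpy as np

a, b, n = 2, 2, 2
N = a * b * n
y = np.arange(N, dtype=float)      # any data vector satisfies the identity

def proj(Z):                       # projection onto the column space of Z
    return Z @ np.linalg.pinv(Z.T @ Z) @ Z.T

i = np.repeat(np.arange(a), b * n)             # cell-ordered factor codes
j = np.tile(np.repeat(np.arange(b), n), a)
one = np.ones((N, 1))
ZA = (i[:, None] == np.arange(a)).astype(float)                # A indicators
ZB = (j[:, None] == np.arange(b)).astype(float)                # B indicators
Zc = ((i * b + j)[:, None] == np.arange(a * b)).astype(float)  # cell indicators

P_mu = proj(one)
P_A  = proj(ZA) - P_mu
P_B  = proj(ZB) - P_mu
P_AB = proj(Zc) - proj(ZA) - proj(ZB) + P_mu
P_E  = np.eye(N) - proj(Zc)

# Orthogonal pieces of the identity matrix; squared lengths add (Pythagoras)
assert np.allclose(P_mu + P_A + P_B + P_AB + P_E, np.eye(N))
SS = [y @ P @ y for P in (P_A, P_B, P_AB, P_E)]
assert np.isclose(sum(SS), y @ (np.eye(N) - P_mu) @ y)   # = SS_T
```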


5. Why Degrees of Freedom Equal Subspace Dimensions¶

For any projection matrix $$\mathbf{P}$$:

  • $$\mathbf{P}^2 = \mathbf{P}$$ (idempotent)
  • $$\mathbf{P}^\top = \mathbf{P}$$ (symmetric)
  • $$\text{rank}(\mathbf{P}) = \text{trace}(\mathbf{P})$$

The degrees of freedom are:

$$ df = \text{rank}(\mathbf{P}) = \dim(\text{corresponding subspace}) $$

Thus:

  • $$df_A = a - 1$$
  • $$df_B = b - 1$$
  • $$df_{AB} = (a - 1)(b - 1)$$
  • $$df_E = ab(n - 1)$$

These are dimensions of orthogonal subspaces, not ad-hoc counts.
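The rank-equals-trace fact is easy to check on any projection. A generic sketch with an arbitrary full-rank design matrix:

```python
import numpy as np

rng = np.random.default_rng(4)
X = rng.normal(size=(10, 3))                  # 10 observations, rank-3 design
P = X @ np.linalg.inv(X.T @ X) @ X.T          # projection onto C(X)

assert np.allclose(P @ P, P)                  # idempotent
assert np.allclose(P, P.T)                    # symmetric
print(round(np.trace(P)))                     # 3: rank of P = dim of C(X) = df
```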


6. Distribution of Sums of Squares (Key Probability Fact)¶

Let $$\boldsymbol{\varepsilon} \sim N(\mathbf{0}, \sigma^2 \mathbf{I})$$
and let $$\mathbf{P}$$ be an idempotent matrix of rank $$r$$.

Then the quadratic form

$$ \frac{1}{\sigma^2}\boldsymbol{\varepsilon}^\top \mathbf{P} \boldsymbol{\varepsilon} \sim \chi^2_r. $$

This is the fundamental theorem behind ANOVA.
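A Monte Carlo sanity check of this fact, with hypothetical sizes. The sample mean of the scaled quadratic form should be near $$r$$ (the mean of $$\chi^2_r$$), and its quantiles should match those of $$\chi^2_r$$:

```python
import numpy as np
from scipy.stats import chi2

rng = np.random.default_rng(5)
N, r, sigma = 8, 3, 2.0
X = rng.normal(size=(N, r))
P = X @ np.linalg.inv(X.T @ X) @ X.T             # rank-r projection

eps = sigma * rng.normal(size=(100_000, N))      # rows are noise vectors
q = np.einsum('ki,ij,kj->k', eps, P, eps) / sigma**2

print(q.mean())                                  # close to r = 3
print(chi2.ppf(0.9, r), np.quantile(q, 0.9))     # matching upper quantiles
```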


7. Consequence for Mean Squares¶

Because:

$$ SS_E = \boldsymbol{\varepsilon}^\top \mathbf{P}_E \boldsymbol{\varepsilon}, \qquad \mathbf{P}_E \text{ has rank } df_E, $$

we get:

$$ \frac{SS_E}{\sigma^2} \sim \chi^2_{df_E}. $$

Similarly, under the null hypothesis for any effect (A, B, AB):

$$ \frac{SS_{\text{effect}}}{\sigma^2} \sim \chi^2_{df_{\text{effect}}}. $$

Dividing by degrees of freedom gives:

$$ \mathbb{E}[MS_{\text{effect}}] = \sigma^2, \qquad \mathbb{E}[MS_E] = \sigma^2 $$

under the corresponding null hypothesis.


8. Why the F-Test Works¶

For example, for factor A:

$$ F_A = \frac{MS_A}{MS_E} = \frac{(SS_A/df_A)}{(SS_E/df_E)}. $$

Since:

  • $$SS_A/\sigma^2 \sim \chi^2_{df_A}$$
  • $$SS_E/\sigma^2 \sim \chi^2_{df_E}$$
  • the quadratic forms are independent (orthogonal projections),

we obtain:

$$ F_A \sim F_{df_A, df_E} \quad \text{under } H_0^{(A)}. $$

Identical logic applies to $$F_B$$ and $$F_{AB}$$.


9. Why Balanced Designs Are Special¶

Balanced designs guarantee:

  • Orthogonality of subspaces
  • Independence of sums of squares
  • Unique decomposition
  • Simple F-tests

In unbalanced designs:

  • Subspaces are not orthogonal
  • Projections depend on ordering
  • Multiple SS definitions arise (Type I, II, III)

This is a geometric, not computational, issue.


10. Deep Interpretation (Big Picture)¶

Two-way ANOVA is:

  • Orthogonal projection geometry
  • Gaussian quadratic forms
  • Chi-square → F distribution theory

Nothing in ANOVA is heuristic — every formula is a theorem in linear algebra and probability.


Final One-Line Summary¶

ANOVA works because the data vector is decomposed into orthogonal subspaces, and Gaussian noise projected onto these subspaces produces independent chi-square variables whose ratios follow F-distributions.



Two-Way ANOVA with Interaction — Compact Worked Example (all computations)¶

We use a balanced $2\times 2$ design with replication ($n=2$ per cell).

  • Factor $A$: levels $A_1,A_2$ ($a=2$)
  • Factor $B$: levels $B_1,B_2$ ($b=2$)
  • Replicates per cell: $n=2$
  • Total $N=abn=8$

Data¶

$B_1$ $B_2$
$A_1$ $10,12$ $20,22$
$A_2$ $20,22$ $10,12$

Cell means: $$ \bar Y_{11\cdot}=11,\quad \bar Y_{12\cdot}=21,\quad \bar Y_{21\cdot}=21,\quad \bar Y_{22\cdot}=11 $$

Row (A) means: $$ \bar Y_{1\cdot\cdot}=\frac{11+21}{2}=16,\qquad \bar Y_{2\cdot\cdot}=\frac{21+11}{2}=16 $$

Column (B) means: $$ \bar Y_{\cdot1\cdot}=\frac{11+21}{2}=16,\qquad \bar Y_{\cdot2\cdot}=\frac{21+11}{2}=16 $$

Grand mean: $$ \bar Y_{\cdot\cdot\cdot}=\frac{16+16}{2}=16 $$


Hypotheses¶

Interaction: $$ H_0^{(AB)}:\ (\alpha\beta)_{ij}=0\ \forall i,j \qquad\text{vs}\qquad H_1^{(AB)}:\ \exists(i,j)\text{ with }(\alpha\beta)_{ij}\ne 0 $$

Main effects: $$ H_0^{(A)}:\alpha_1=\alpha_2=0,\qquad H_0^{(B)}:\beta_1=\beta_2=0 $$


Sums of Squares (balanced formulas)¶

Error (within-cell) sum of squares¶

$$ SS_E=\sum_{i=1}^2\sum_{j=1}^2\sum_{k=1}^2 (Y_{ijk}-\bar Y_{ij\cdot})^2 $$

Compute per cell (all identical):

  • For $10,12$ with mean $11$: $(10-11)^2+(12-11)^2=1+1=2$
  • For $20,22$ with mean $21$: $(20-21)^2+(22-21)^2=1+1=2$

Thus: $$ SS_E = 2+2+2+2 = 8 $$


Factor A sum of squares¶

Balanced formula: $$ SS_A = bn\sum_{i=1}^2(\bar Y_{i\cdot\cdot}-\bar Y_{\cdot\cdot\cdot})^2 $$ Here $bn=2\cdot 2=4$ and both row means equal the grand mean ($16$), so: $$ SS_A = 4\left[(16-16)^2+(16-16)^2\right]=0 $$


Factor B sum of squares¶

$$ SS_B = an\sum_{j=1}^2(\bar Y_{\cdot j\cdot}-\bar Y_{\cdot\cdot\cdot})^2 $$

Here $an=2\cdot 2=4$ and both column means equal $16$, so: $$ SS_B = 0 $$


Interaction sum of squares¶

Balanced formula: $$ SS_{AB} = n\sum_{i=1}^2\sum_{j=1}^2 \left(\bar Y_{ij\cdot}-\bar Y_{i\cdot\cdot}-\bar Y_{\cdot j\cdot}+\bar Y_{\cdot\cdot\cdot}\right)^2 $$

Since $\bar Y_{i\cdot\cdot}=\bar Y_{\cdot j\cdot}=\bar Y_{\cdot\cdot\cdot}=16$, the bracket simplifies to: $$ \bar Y_{ij\cdot}-16 $$

So: $$ SS_{AB} = 2\left[(11-16)^2+(21-16)^2+(21-16)^2+(11-16)^2\right] $$

$$ = 2\left[25+25+25+25\right] = 2\cdot 100 = 200 $$

Total sum of squares (check)¶

The two-way ANOVA decomposition is: $$ SS_T = SS_A + SS_B + SS_{AB} + SS_E $$ Thus: $$ SS_T = 0+0+200+8 = 208 $$ Direct computation agrees: each of the values $10,12,20,22$ appears twice, so $SS_T = 2\left[6^2+4^2+4^2+6^2\right] = 2\cdot 104 = 208$.


Degrees of Freedom¶

$$ df_A=a-1=1,\quad df_B=b-1=1,\quad df_{AB}=(a-1)(b-1)=1,\quad df_E=ab(n-1)=4\cdot 1=4 $$

Total: $$ df_T=N-1=7 $$

(And $1+1+1+4=7$ checks out.)


Mean Squares and F statistics¶

$$ MS_A=\frac{SS_A}{df_A}=0,\qquad MS_B=\frac{SS_B}{df_B}=0,\qquad MS_{AB}=\frac{SS_{AB}}{df_{AB}}=\frac{200}{1}=200 $$

$$ MS_E=\frac{SS_E}{df_E}=\frac{8}{4}=2 $$

F tests: $$ F_A=\frac{MS_A}{MS_E}=0,\qquad F_B=\frac{MS_B}{MS_E}=0,\qquad F_{AB}=\frac{MS_{AB}}{MS_E}=\frac{200}{2}=100 $$


ANOVA Table (compact)¶

Source SS df MS F
A $0$ $1$ $0$ $0$
B $0$ $1$ $0$ $0$
A×B $200$ $1$ $200$ $100$
Error $8$ $4$ $2$ —
Total $208$ $7$ — —

Conclusion (interpretation)¶

  • The interaction is huge: $F_{AB}=100$ (so we strongly reject $H_0^{(AB)}$).
  • Main effects are zero here (both $F_A=0$ and $F_B=0$).

This is the classic cross-over interaction: the effect of $A$ depends entirely on the level of $B$ (and vice versa).

In [7]:
import numpy as np
import pandas as pd
from scipy.stats import f


def two_way_anova_with_interaction(data, alpha=0.05):
    """
    Two-Way ANOVA WITH interaction for a balanced design with replication.

    Parameters
    ----------
    data : dict
        Nested dict of the form:
        data[A_level][B_level] = list (replicates in that cell)
        Example:
        {
          "A1": {"B1": [..], "B2": [..]},
          "A2": {"B1": [..], "B2": [..]}
        }

        Requirements:
        - Same B levels for every A level
        - Same number of replicates n in every cell (balanced)
        - Replication required: n >= 2

    alpha : float
        Significance level (for critical values)

    Returns
    -------
    dict with:
      - anova_table (pandas DataFrame)
      - means (grand, row, col, cell)
      - ss (SS_A, SS_B, SS_AB, SS_E, SS_T)
      - dfs, ms, F, p-values, critical values
    """

    # ---------- Parse levels ----------
    A_levels = list(data.keys())
    if len(A_levels) < 2:
        raise ValueError("Need at least 2 levels of factor A.")

    B_levels = list(data[A_levels[0]].keys())
    if len(B_levels) < 2:
        raise ValueError("Need at least 2 levels of factor B.")

    # ---------- Validate structure and balance ----------
    a = len(A_levels)
    b = len(B_levels)

    # Check all A levels contain same B levels
    for alev in A_levels:
        if set(data[alev].keys()) != set(B_levels):
            raise ValueError("All A levels must contain the same set of B levels.")

    # Check balanced replication n per cell
    n_list = []
    for alev in A_levels:
        for blev in B_levels:
            cell = data[alev][blev]
            if not isinstance(cell, (list, tuple, np.ndarray)) or len(cell) == 0:
                raise ValueError(f"Cell ({alev}, {blev}) must be a non-empty list of replicates.")
            n_list.append(len(cell))

    if len(set(n_list)) != 1:
        raise ValueError("Balanced design required: all cells must have the same number of replicates.")
    n = n_list[0]
    if n < 2:
        raise ValueError("Replication required: each cell must have n >= 2 for SS_E and interaction testing.")

    N = a * b * n

    # ---------- Build arrays for easier computation ----------
    # y[i,j,k]
    y = np.zeros((a, b, n), dtype=float)
    for i, alev in enumerate(A_levels):
        for j, blev in enumerate(B_levels):
            y[i, j, :] = np.array(data[alev][blev], dtype=float)

    # ---------- Means ----------
    grand_mean = y.mean()
    cell_means = y.mean(axis=2)          # shape (a,b)
    row_means = y.mean(axis=(1, 2))      # shape (a,)
    col_means = y.mean(axis=(0, 2))      # shape (b,)

    # ---------- Sums of Squares ----------
    # Total
    SS_T = np.sum((y - grand_mean) ** 2)

    # Error (within-cell)
    SS_E = np.sum((y - cell_means[:, :, None]) ** 2)

    # A
    SS_A = b * n * np.sum((row_means - grand_mean) ** 2)

    # B
    SS_B = a * n * np.sum((col_means - grand_mean) ** 2)

    # Interaction
    SS_AB = n * np.sum((cell_means - row_means[:, None] - col_means[None, :] + grand_mean) ** 2)

    # Check decomposition (numerical tolerance)
    # SS_T should equal SS_A + SS_B + SS_AB + SS_E (balanced with interaction)
    # We'll not raise error; we return the discrepancy.
    discrepancy = SS_T - (SS_A + SS_B + SS_AB + SS_E)

    # ---------- Degrees of Freedom ----------
    df_A = a - 1
    df_B = b - 1
    df_AB = (a - 1) * (b - 1)
    df_E = a * b * (n - 1)
    df_T = N - 1

    # ---------- Mean Squares ----------
    MS_A = SS_A / df_A
    MS_B = SS_B / df_B
    MS_AB = SS_AB / df_AB
    MS_E = SS_E / df_E

    # ---------- F statistics ----------
    F_A = MS_A / MS_E
    F_B = MS_B / MS_E
    F_AB = MS_AB / MS_E

    # ---------- p-values ----------
    p_A = f.sf(F_A, df_A, df_E)      # survival function: right-tail p-value,
    p_B = f.sf(F_B, df_B, df_E)      # numerically safer than 1 - f.cdf(...)
    p_AB = f.sf(F_AB, df_AB, df_E)

    # ---------- Critical values (right tail) ----------
    Fcrit_A = f.ppf(1 - alpha, df_A, df_E)
    Fcrit_B = f.ppf(1 - alpha, df_B, df_E)
    Fcrit_AB = f.ppf(1 - alpha, df_AB, df_E)

    # ---------- ANOVA table (np.nan marks empty cells) ----------
    anova_df = pd.DataFrame(
        [
            ["Factor A", SS_A, df_A, MS_A, F_A,  p_A,  Fcrit_A],
            ["Factor B", SS_B, df_B, MS_B, F_B,  p_B,  Fcrit_B],
            ["A×B",      SS_AB, df_AB, MS_AB, F_AB, p_AB, Fcrit_AB],
            ["Error",    SS_E, df_E, MS_E, np.nan, np.nan, np.nan],
            ["Total",    SS_T, df_T, np.nan, np.nan, np.nan, np.nan],
        ],
        columns=["Source", "SS", "df", "MS", "F", "p-value", "F_crit"]
    )

    # Nice packaging
    return {
        "anova_table": anova_df,
        "means": {
            "grand_mean": grand_mean,
            "row_means": dict(zip(A_levels, row_means)),
            "col_means": dict(zip(B_levels, col_means)),
            "cell_means": {
                (A_levels[i], B_levels[j]): cell_means[i, j]
                for i in range(a) for j in range(b)
            }
        },
        "ss": {"SS_A": SS_A, "SS_B": SS_B, "SS_AB": SS_AB, "SS_E": SS_E, "SS_T": SS_T},
        "dfs": {"df_A": df_A, "df_B": df_B, "df_AB": df_AB, "df_E": df_E, "df_T": df_T},
        "ms": {"MS_A": MS_A, "MS_B": MS_B, "MS_AB": MS_AB, "MS_E": MS_E},
        "F": {"F_A": F_A, "F_B": F_B, "F_AB": F_AB},
        "p_values": {"p_A": p_A, "p_B": p_B, "p_AB": p_AB},
        "F_crit": {"Fcrit_A": Fcrit_A, "Fcrit_B": Fcrit_B, "Fcrit_AB": Fcrit_AB},
        "decomposition_discrepancy": discrepancy
    }
In [8]:
# Input (same as before)
data = {
    "A1": {"B1": [10, 12], "B2": [20, 22]},
    "A2": {"B1": [20, 22], "B2": [10, 12]},
}

alpha = 0.05
res = two_way_anova_with_interaction(data, alpha=alpha)

anova_df = res["anova_table"].copy()

display(
    anova_df.style
    .format(
        {
            "SS": "{:.4f}",
            "MS": "{:.4f}",
            "F": "{:.4f}",
            "p-value": "{:.6g}",
            "F_crit": "{:.4f}",
        },
        na_rep=""   # show NaNs as blank
    )
)

print("Decomposition discrepancy SS_T - (SS_A+SS_B+SS_AB+SS_E) =",
      float(res["decomposition_discrepancy"]))
  Source SS df MS F p-value F_crit
0 Factor A 0.0000 1 0.0000 0.0000 1 7.7086
1 Factor B 0.0000 1 0.0000 0.0000 1 7.7086
2 A×B 200.0000 1 200.0000 100.0000 0.000562004 7.7086
3 Error 8.0000 4 2.0000
4 Total 208.0000 7
Decomposition discrepancy SS_T - (SS_A+SS_B+SS_AB+SS_E) = 0.0