Correlation measures the strength and direction of association between two variables.
We study three main coefficients: Pearson, Spearman, and Kendall.
Let $(X, Y)$ be two random variables.
A correlation coefficient $\rho$ is a number in $[-1,1]$: values near $1$ indicate strong positive association, values near $-1$ strong negative association, and values near $0$ weak association.
The Pearson correlation coefficient is defined as:
$$ \rho_{X,Y} = \frac{\mathrm{Cov}(X,Y)}{\sigma_X \sigma_Y}, $$

where $\mathrm{Cov}(X,Y)$ is the covariance of $X$ and $Y$, and $\sigma_X$, $\sigma_Y$ are their standard deviations.
Given data $(x_1,y_1), \dots, (x_n,y_n)$:
$$ r = \frac{\sum_{i=1}^n (x_i - \bar{x})(y_i - \bar{y})} {\sqrt{\sum_{i=1}^n (x_i - \bar{x})^2} \sqrt{\sum_{i=1}^n (y_i - \bar{y})^2}} $$

Let $(X_1, Y_1), \dots, (X_n, Y_n)$ be a sample.
Denote by $R_i$ the rank of $X_i$ among $X_1,\dots,X_n$, and by $S_i$ the rank of $Y_i$ among $Y_1,\dots,Y_n$.
Then the Spearman rank correlation coefficient is defined as:
$$ \rho_s = \frac{\sum_{i=1}^n (R_i - \bar{R})(S_i - \bar{S})} {\sqrt{\sum_{i=1}^n (R_i - \bar{R})^2 \sum_{i=1}^n (S_i - \bar{S})^2}} $$

where $\bar{R}$ and $\bar{S}$ are the mean ranks.
This is simply the Pearson correlation applied to the ranks:
$$ \rho_s = \mathrm{Corr}(R(X), R(Y)) $$

If there are no ties (all ranks are distinct), then:
$$ \rho_s = 1 - \frac{6}{n^3 - n} \sum_{i=1}^n (R_i - S_i)^2 $$

When there are no ties, the ranks satisfy:

$$ R_i, S_i \in \{1, 2, \dots, n\} $$

So:

$$ \bar{R} = \bar{S} = \frac{n+1}{2}. $$

We compute:

$$ \sum_{i=1}^n (R_i - \bar{R})^2 = \sum_{i=1}^n \left(R_i - \frac{n+1}{2}\right)^2. $$

Using the known formula

$$ \sum_{i=1}^n R_i^2 = \frac{n(n+1)(2n+1)}{6}, $$

we obtain:

$$ \sum_{i=1}^n (R_i - \bar{R})^2 = \frac{n(n^2 - 1)}{12}. $$

The same holds for $S_i$.
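The formula $\sum_{i=1}^n (R_i - \bar{R})^2 = n(n^2-1)/12$ is easy to sanity-check numerically; a minimal sketch:

```python
import numpy as np

# Without ties, the ranks are a permutation of 1..n, so the sum of squared
# deviations from the mean rank (n+1)/2 should equal n(n^2 - 1)/12.
for n in (5, 10, 100):
    ranks = np.arange(1, n + 1)
    lhs = np.sum((ranks - (n + 1) / 2) ** 2)
    rhs = n * (n**2 - 1) / 12
    assert np.isclose(lhs, rhs)
```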
Consider:

$$ \sum_{i=1}^n (R_i - \bar{R})(S_i - \bar{S}) $$

Use the identity (valid here because $\bar{R} = \bar{S}$, so $R_i - S_i = (R_i - \bar{R}) - (S_i - \bar{S})$):

$$ (R_i - S_i)^2 = (R_i - \bar{R})^2 + (S_i - \bar{S})^2 - 2(R_i - \bar{R})(S_i - \bar{S}) $$

Summing over $i$:

$$ \sum (R_i - S_i)^2 = \sum (R_i - \bar{R})^2 + \sum (S_i - \bar{S})^2 - 2 \sum (R_i - \bar{R})(S_i - \bar{S}) $$

Since both sums of squared deviations are equal,

$$ \sum (R_i - \bar{R})^2 = \sum (S_i - \bar{S})^2 = \frac{n(n^2 - 1)}{12}, $$

we get:

$$ \sum (R_i - \bar{R})(S_i - \bar{S}) = \frac{1}{2} \left[ 2 \cdot \frac{n(n^2 - 1)}{12} - \sum (R_i - S_i)^2 \right] = \frac{n(n^2 - 1)}{12} - \frac{1}{2} \sum (R_i - S_i)^2. $$

Recall:

$$ \rho_s = \frac{\sum (R_i - \bar{R})(S_i - \bar{S})} {\sqrt{\sum (R_i - \bar{R})^2 \sum (S_i - \bar{S})^2}} $$

The denominator is:

$$ \sqrt{ \left(\frac{n(n^2 - 1)}{12}\right)^2 } = \frac{n(n^2 - 1)}{12} $$

So:

$$ \rho_s = \frac{ \frac{n(n^2 - 1)}{12} - \frac{1}{2} \sum (R_i - S_i)^2 } { \frac{n(n^2 - 1)}{12} } = 1 - \frac{6}{n(n^2 - 1)} \sum (R_i - S_i)^2, $$

or equivalently:

$$ \rho_s = 1 - \frac{6}{n^3 - n} \sum_{i=1}^n (R_i - S_i)^2. $$

If ties exist, this simplified formula is no longer exact.
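The no-ties shortcut can be checked against the direct definition (Pearson correlation of the ranks); a small sketch using `scipy.stats.rankdata`:

```python
import numpy as np
from scipy.stats import rankdata, pearsonr, spearmanr

rng = np.random.default_rng(0)
x = rng.normal(size=30)          # continuous draws: ties occur with probability zero
y = x + rng.normal(size=30)

R, S = rankdata(x), rankdata(y)
n = len(x)

# Shortcut formula (valid without ties)
rho_shortcut = 1 - 6 * np.sum((R - S) ** 2) / (n**3 - n)

# Direct definition: Pearson correlation applied to the ranks
rho_direct = pearsonr(R, S)[0]

assert np.isclose(rho_shortcut, rho_direct)
assert np.isclose(rho_shortcut, spearmanr(x, y)[0])
```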
In that case, use the definition directly (with average ranks assigned to tied values):

$$ \rho_s = \mathrm{Corr}(R(X), R(Y)) $$

Kendall's tau is based on comparing pairs of observations. For each pair $(i,j)$ with $i<j$, the pair is called *concordant* if $(x_i - x_j)(y_i - y_j) > 0$ and *discordant* if $(x_i - x_j)(y_i - y_j) < 0$. Then

$$ \tau = \frac{C - D}{\binom{n}{2}}, $$

where $C$ and $D$ are the numbers of concordant and discordant pairs, respectively.
| Property | Pearson | Spearman | Kendall |
|---|---|---|---|
| Measures | Linear dependence | Monotonic dependence | Pairwise agreement |
| Uses | Raw values | Ranks | Pair comparisons |
| Sensitive to outliers | Yes | Less | Very low |
| Captures nonlinear | No | Yes (monotonic) | Yes (monotonic) |
| Interpretation | Covariance-based | Rank correlation | Probability of concordance |
| Efficiency (normal data) | Highest | Medium | Lower |
| Robustness | Low | Medium | High |
Correlation does NOT imply causation.
Also: $$ \rho = 0 \nRightarrow X \text{ and } Y \text{ are independent} $$
Example: $Y = X^2$ with symmetric $X$ gives zero Pearson correlation but strong dependence.
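This example is easy to verify numerically; a minimal sketch with a symmetric grid and the deterministic relationship $y = x^2$:

```python
import numpy as np
from scipy.stats import pearsonr

x = np.linspace(-3, 3, 201)      # symmetric around 0
y = x**2                         # perfect (deterministic) dependence

# Cov(X, Y) = E[X^3] - E[X] E[X^2] = 0 by symmetry,
# so the Pearson correlation vanishes despite perfect dependence.
r = pearsonr(x, y)[0]
assert abs(r) < 1e-8
```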
In this section we illustrate the differences between the Pearson, Spearman, and Kendall coefficients on simulated data. The goal of these plots is to build intuition for when each coefficient succeeds or fails at detecting dependence.
```python
import numpy as np
import pandas as pd
import plotly.express as px
from scipy.stats import pearsonr, spearmanr, kendalltau

def correlation_summary(x, y):
    """Return the Pearson, Spearman, and Kendall coefficients for x and y."""
    pearson = pearsonr(x, y)[0]
    spearman = spearmanr(x, y)[0]
    kendall = kendalltau(x, y)[0]
    return pearson, spearman, kendall
```
Here all three coefficients should be large and positive. Pearson performs especially well because the relationship is linear.
```python
np.random.seed(42)
n = 100
x = np.linspace(0, 10, n)
y = 2 * x + np.random.normal(scale=2, size=n)

pearson, spearman, kendall = correlation_summary(x, y)

df = pd.DataFrame({"x": x, "y": y})
fig = px.scatter(
    df, x="x", y="y", trendline="ols",
    title=f"Linear Relationship<br>Pearson={pearson:.3f}, Spearman={spearman:.3f}, Kendall={kendall:.3f}"
)
fig.show()
```
Here the relationship is increasing, but not linear. So Spearman and Kendall fully capture the monotonic trend, while Pearson understates the strength of the association.
```python
np.random.seed(42)
x = np.linspace(0.1, 10, n)
y = np.log(x) + np.random.normal(scale=0.08, size=n)

pearson, spearman, kendall = correlation_summary(x, y)

df = pd.DataFrame({"x": x, "y": y})
fig = px.scatter(
    df, x="x", y="y",
    title=f"Monotonic but Nonlinear Relationship<br>Pearson={pearson:.3f}, Spearman={spearman:.3f}, Kendall={kendall:.3f}"
)
fig.show()
```
This is a key example.
Take something like $y = x^2 + \text{noise}$ with $x$ symmetric around $0$. Then there is a clear dependence, but it is not monotonic.
In such a case all three coefficients are close to zero: all of them can fail to detect dependence when the dependence is neither linear nor monotonic.
```python
np.random.seed(42)
x = np.linspace(-3, 3, n)
y = x**2 + np.random.normal(scale=0.8, size=n)

pearson, spearman, kendall = correlation_summary(x, y)

df = pd.DataFrame({"x": x, "y": y})
fig = px.scatter(
    df, x="x", y="y",
    title=f"Non-monotonic Relationship<br>Pearson={pearson:.3f}, Spearman={spearman:.3f}, Kendall={kendall:.3f}"
)
fig.show()
```
Pearson is very sensitive to outliers. Spearman and Kendall are typically much more stable.
```python
np.random.seed(42)
x = np.linspace(0, 10, n)
y = x + np.random.normal(scale=1.0, size=n)

# add one extreme outlier
x_out = np.append(x, [10])
y_out = np.append(y, [40])

pearson, spearman, kendall = correlation_summary(x_out, y_out)

df = pd.DataFrame({"x": x_out, "y": y_out})
fig = px.scatter(
    df, x="x", y="y", trendline="ols",
    title=f"Linear Relationship with Outlier<br>Pearson={pearson:.3f}, Spearman={spearman:.3f}, Kendall={kendall:.3f}"
)
fig.show()
```
This final plot compares the three coefficients across several common dependence structures.
```python
np.random.seed(42)
n = 200
datasets = {}

# Linear
x1 = np.linspace(0, 10, n)
y1 = 3 * x1 + np.random.normal(scale=3, size=n)
datasets["Linear"] = (x1, y1)

# Monotonic nonlinear
x2 = np.linspace(0.1, 10, n)
y2 = np.sqrt(x2) + np.random.normal(scale=0.12, size=n)
datasets["Monotonic nonlinear"] = (x2, y2)

# Non-monotonic
x3 = np.linspace(-3, 3, n)
y3 = x3**2 + np.random.normal(scale=0.7, size=n)
datasets["Non-monotonic"] = (x3, y3)

# Linear with outlier
x4 = np.linspace(0, 10, n)
y4 = x4 + np.random.normal(scale=1.0, size=n)
x4 = np.append(x4, 10)
y4 = np.append(y4, 40)
datasets["With outlier"] = (x4, y4)

rows = []
for name, (xv, yv) in datasets.items():
    p, s, k = correlation_summary(xv, yv)
    rows.append({"Dataset": name, "Pearson": p, "Spearman": s, "Kendall": k})

corr_df = pd.DataFrame(rows)
corr_long = corr_df.melt(id_vars="Dataset", var_name="Coefficient", value_name="Value")

fig = px.bar(
    corr_long,
    x="Dataset",
    y="Value",
    color="Coefficient",
    barmode="group",
    title="Comparison of Pearson, Spearman, and Kendall Across Different Dependence Structures"
)
fig.show()
```
We now study how to test whether a correlation is statistically significant.
In all cases, the goal is to test:
$$ H_0: \text{no association} \quad \text{vs} \quad H_1: \text{association exists} $$

Depending on the coefficient, this translates into different mathematical hypotheses.
We test:
$$ H_0: \rho = 0 \quad \text{vs} \quad H_1: \rho \ne 0, $$

where $\rho$ is the population Pearson correlation.
Given sample correlation $r$, define:
$$ t = \frac{r \sqrt{n - 2}}{\sqrt{1 - r^2}} $$

Under $H_0$ (assuming approximate bivariate normality), $t$ follows a Student's $t$-distribution with $n-2$ degrees of freedom.
Reject $H_0$ if:
$$ |t| > t_{n-2,\, 1-\alpha/2}, $$

or equivalently, if the p-value is small.
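The test above is easy to compute by hand; a sketch on simulated data, checked against `scipy.stats.pearsonr` (whose two-sided p-value is mathematically equivalent to the $t$-based one):

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(1)
n = 50
x = rng.normal(size=n)
y = 0.5 * x + rng.normal(size=n)

r = np.corrcoef(x, y)[0, 1]
t = r * np.sqrt(n - 2) / np.sqrt(1 - r**2)       # test statistic
p_manual = 2 * stats.t.sf(abs(t), df=n - 2)      # two-sided p-value

p_scipy = stats.pearsonr(x, y)[1]
assert np.isclose(p_manual, p_scipy)
```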
For small $n$, the distribution of $\rho_s$ can be computed exactly (via permutations).
For large $n$, we use:
$$ t = \frac{\rho_s \sqrt{n - 2}}{\sqrt{1 - \rho_s^2}} $$

and approximate:

$$ t \approx t_{n-2} $$

Another approximation:

$$ \sqrt{n-1} \, \rho_s \approx \mathcal{N}(0,1) $$

Since Spearman is Pearson on ranks,

$$ \rho_s = \mathrm{Corr}(R(X), R(Y)), $$

we are effectively testing linear correlation between ranks.
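The $t$ approximation above can be sketched directly and compared with `scipy.stats.spearmanr`, whose default p-value is based on the same $t$-distribution approximation:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(2)
n = 60
x = rng.normal(size=n)
y = np.exp(x) + rng.normal(scale=0.5, size=n)    # monotonic signal plus noise

rho_s, p_scipy = stats.spearmanr(x, y)

# t approximation for the Spearman test
t = rho_s * np.sqrt(n - 2) / np.sqrt(1 - rho_s**2)
p_t = 2 * stats.t.sf(abs(t), df=n - 2)

assert np.isclose(p_t, p_scipy)
```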
Under $H_0$, for large $n$:
$$ \tau \approx \mathcal{N}(0, \sigma^2), \qquad \sigma^2 = \frac{2(2n+5)}{9n(n-1)}. $$

Then the standardized statistic is

$$ Z = \frac{\tau}{\sigma} \sim \mathcal{N}(0,1). $$

Reject $H_0$ if:

$$ |Z| > z_{1-\alpha/2}. $$

Recall:

$$ \tau = P(\text{concordant}) - P(\text{discordant}), $$

so testing $\tau = 0$ means:

$$ P(\text{concordant}) = P(\text{discordant}) $$

| Feature | Pearson Test | Spearman Test | Kendall Test |
|---|---|---|---|
| Null hypothesis | $\rho=0$ | $\rho_s=0$ | $\tau=0$ |
| Distribution | $t_{n-2}$ | approx $t$ or normal | normal |
| Assumptions | Normality | None | None |
| Measures | Linear dependence | Monotonic dependence | Pairwise concordance |
| Robustness | Low | Medium | High |
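The normal approximation for Kendall's test can be sketched as follows. Note that `scipy.stats.kendalltau` may use an exact method for small samples or a tie-corrected variance, so only rough agreement with its p-value is asserted here:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(3)
n = 60
x = rng.normal(size=n)
y = 0.4 * x + rng.normal(size=n)

tau, p_scipy = stats.kendalltau(x, y)

# Asymptotic standard deviation of tau under H0 (no ties)
sigma = np.sqrt(2 * (2 * n + 5) / (9 * n * (n - 1)))
z = tau / sigma
p_normal = 2 * stats.norm.sf(abs(z))

# Both p-values should lead to the same conclusion; exact equality is not expected.
assert abs(p_normal - p_scipy) < 0.05
```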
Under normality, the Pearson test is the most efficient of the three.
Failure to reject $H_0$ does NOT imply independence.
It only means that the data do not provide sufficient evidence of association.
The GDP growth rates of Russia for the years 2006–2012
(in percent relative to 2005) are:

$$ 108.2,\ 117.4,\ 123.5,\ 113.9,\ 119.0,\ 124.1,\ 128.4,\ 108.5,\ 118.0,\ 124.2,\ 114.5,\ 119.6,\ 124.6,\ 128.7. $$

The corresponding indicators for Belarus are:

$$ 107,\ 116,\ 118,\ 101,\ 105,\ 111,\ 111,\ 108,\ 117,\ 121,\ 103,\ 108,\ 114,\ 115. $$

Test the hypothesis that these two samples are independent.
Use three association measures: the Pearson, Spearman, and Kendall correlation coefficients.
Interpret the results and state whether there is evidence against independence.
```python
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr, kendalltau

# Data
russia = np.array([108.2, 117.4, 123.5, 113.9, 119.0, 124.1, 128.4,
                   108.5, 118.0, 124.2, 114.5, 119.6, 124.6, 128.7])
belarus = np.array([107, 116, 118, 101, 105, 111, 111,
                    108, 117, 121, 103, 108, 114, 115])

# Compute coefficients and p-values
pearson_corr, pearson_p = pearsonr(russia, belarus)
spearman_corr, spearman_p = spearmanr(russia, belarus)
kendall_corr, kendall_p = kendalltau(russia, belarus)

print("Pearson correlation:")
print(f"  coefficient = {pearson_corr:.6f}")
print(f"  p-value = {pearson_p:.6f}\n")

print("Spearman correlation:")
print(f"  coefficient = {spearman_corr:.6f}")
print(f"  p-value = {spearman_p:.6f}\n")

print("Kendall correlation:")
print(f"  coefficient = {kendall_corr:.6f}")
print(f"  p-value = {kendall_p:.6f}")

# Scatter plot
plt.figure(figsize=(7, 5))
plt.scatter(russia, belarus)
plt.xlabel("Russia GDP growth rate")
plt.ylabel("Belarus GDP growth rate")
plt.title("Scatter Plot of GDP Growth Rates")
plt.grid(True)
plt.show()
```
```
Pearson correlation:
  coefficient = 0.554708
  p-value = 0.039516

Spearman correlation:
  coefficient = 0.537446
  p-value = 0.047474

Kendall correlation:
  coefficient = 0.411136
  p-value = 0.042188
```
We are given two samples of equal size:
$$ X = \text{GDP growth rates of Russia}, \qquad Y = \text{GDP growth rates of Belarus}, $$

with sample size $n = 14$. We want to test whether the two samples are independent.
If two variables are independent, then in particular there should be no systematic association between them.
To investigate this, we compute three different coefficients: Pearson's $r$, Spearman's $\rho_s$, and Kendall's $\tau$.

They measure different kinds of association: Pearson detects linear dependence, Spearman monotonic dependence, and Kendall agreement in pairwise orderings.
If all three coefficients are significantly positive or negative, this is evidence against independence.
Russia:
$$ x = (108.2,\,117.4,\,123.5,\,113.9,\,119.0,\,124.1,\,128.4,\,108.5,\,118.0,\,124.2,\,114.5,\,119.6,\,124.6,\,128.7) $$

Belarus:

$$ y = (107,\,116,\,118,\,101,\,105,\,111,\,111,\,108,\,117,\,121,\,103,\,108,\,114,\,115) $$

The sample Pearson correlation coefficient is

$$ r = \frac{\sum_{i=1}^n (x_i-\bar{x})(y_i-\bar{y})} {\sqrt{\sum_{i=1}^n (x_i-\bar{x})^2}\sqrt{\sum_{i=1}^n (y_i-\bar{y})^2}}. $$

It measures the strength of the linear relationship between the variables.
For Pearson correlation we test:
$$ H_0: \rho = 0 \qquad \text{vs} \qquad H_1: \rho \ne 0. $$

Under independence, we must have $\rho=0$, so this is a natural test.
If the joint distribution is approximately bivariate normal, then under $H_0$:
$$ T = \frac{r\sqrt{n-2}}{\sqrt{1-r^2}} \sim t_{n-2}. $$

Here $n=14$, so the number of degrees of freedom is $n-2=12$. After computing $r$, we substitute it into this formula and obtain the test statistic.
If the corresponding p-value is small, we reject $H_0$.
Let $R_i$ be the rank of $x_i$ among $x_1,\dots,x_n$, and let $S_i$ be the rank of $y_i$ among $y_1,\dots,y_n$.
The Spearman correlation coefficient is defined by
$$ \rho_s = \frac{\sum_{i=1}^n (R_i-\bar{R})(S_i-\bar{S})} {\sqrt{\sum_{i=1}^n (R_i-\bar{R})^2}\sqrt{\sum_{i=1}^n (S_i-\bar{S})^2}}. $$

So Spearman correlation is simply Pearson correlation applied to the ranks.
Spearman correlation measures whether the relationship is monotone: it equals $1$ for a perfectly increasing relationship and $-1$ for a perfectly decreasing one. It is less sensitive to outliers than Pearson correlation.
We test
$$ H_0: \rho_s = 0 \qquad \text{vs} \qquad H_1: \rho_s \ne 0. $$

For small samples one may use exact permutation distributions; in practice we often use the p-value returned by statistical software.
For every pair $(i,j)$ with $i<j$, compare the relative order of $x_i,x_j$ and $y_i,y_j$.
A pair is *concordant* if $(x_i - x_j)(y_i - y_j) > 0$ and *discordant* if $(x_i - x_j)(y_i - y_j) < 0$.
Kendall's tau is
$$ \tau = \frac{C-D}{\binom{n}{2}}, $$

where $C$ is the number of concordant pairs and $D$ is the number of discordant pairs.
With ties, the corrected version of Kendall's tau is used automatically in software.
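The pair-counting definition can be implemented directly. A sketch below computes tau-a (no tie correction), which agrees with `scipy.stats.kendalltau` on tie-free data:

```python
import numpy as np
from itertools import combinations
from scipy.stats import kendalltau

def kendall_tau_a(x, y):
    """Count concordant and discordant pairs directly (no tie correction)."""
    C = D = 0
    for i, j in combinations(range(len(x)), 2):
        s = (x[i] - x[j]) * (y[i] - y[j])
        if s > 0:
            C += 1
        elif s < 0:
            D += 1
    n = len(x)
    return (C - D) / (n * (n - 1) / 2)

rng = np.random.default_rng(4)
x = rng.normal(size=25)          # continuous draws: no ties
y = x + rng.normal(size=25)
assert np.isclose(kendall_tau_a(x, y), kendalltau(x, y)[0])
```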
Kendall's tau measures the tendency of the two variables to move in the same order.
It has the probabilistic meaning
$$ \tau = P(\text{concordance}) - P(\text{discordance}), $$

at least in the ideal no-tie case.
We test
$$ H_0: \tau = 0 \qquad \text{vs} \qquad H_1: \tau \ne 0. $$

Again, the p-value can be computed directly using software.
After computing the three coefficients, we obtain $r \approx 0.555$, $\rho_s \approx 0.537$, and $\tau \approx 0.411$.
These values are all positive and reasonably large.
So all three methods suggest a positive association between the GDP growth rates of Russia and Belarus in this dataset.
Using the corresponding hypothesis tests, the p-values are small (below standard significance levels such as $0.05$), so we reject the null hypothesis of no association.
Hence, the data provide evidence that the two samples are not independent.
The GDP growth rates of Russia and Belarus show a statistically significant positive association.
Therefore, based on Pearson, Spearman, and Kendall correlation analysis, we reject the hypothesis of independence of the two samples.
Strictly speaking, rejecting $H_0: \rho = 0$ (or $\rho_s = 0$, $\tau = 0$) rules out only zero correlation, not every conceivable form of dependence; but any significant correlation is already incompatible with independence.
Thus, in this problem, the observed positive correlations support the conclusion that the samples are not independent.