import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import jarque_bera, shapiro, anderson, chisquare, norm
import scipy.stats as stats

# -----------------------------
# Generate normal data
# -----------------------------

np.random.seed(42)

x = np.random.normal(loc=0, scale=1, size=1000)

alpha = 0.05

# -----------------------------
# 1. Jarque-Bera test
# -----------------------------

jb_stat, jb_p = jarque_bera(x)

# -----------------------------
# 2. Shapiro-Wilk test
# -----------------------------

shapiro_stat, shapiro_p = shapiro(x)

# -----------------------------
# 3. Anderson-Darling test
# -----------------------------

ad_result = anderson(x, dist="norm")

ad_stat = ad_result.statistic

# scipy gives critical values, not p-values
# We compare the statistic with the 5% critical value

ad_critical_5 = ad_result.critical_values[
    list(ad_result.significance_level).index(5.0)
]

ad_decision = (
    "Reject H0"
    if ad_stat > ad_critical_5
    else "Fail to reject H0"
)

# -----------------------------
# 4. Pearson Chi-Square test
# -----------------------------

# Estimate parameters of normal distribution
mu_hat = np.mean(x)
sigma_hat = np.std(x, ddof=1)

# Choose number of bins
k = 10

# Use equal-probability bins under fitted normal distribution
probabilities = np.linspace(0, 1, k + 1)

bin_edges = norm.ppf(probabilities, loc=mu_hat, scale=sigma_hat)

# Replace infinities at the endpoints
bin_edges[0] = -np.inf
bin_edges[-1] = np.inf

# Observed frequencies
observed, _ = np.histogram(x, bins=bin_edges)

# Expected frequencies
expected = np.ones(k) * len(x) / k

# Pearson chi-square statistic manually
chi_stat = np.sum((observed - expected) ** 2 / expected)

# Degrees of freedom:
# k bins - 1 constraint - 2 estimated parameters
df = k - 1 - 2

chi_p = 1 - stats.chi2.cdf(chi_stat, df=df)

# -----------------------------
# Create comparison table
# -----------------------------

results = pd.DataFrame({
    "Test": [
        "Jarque-Bera",
        "Shapiro-Wilk",
        "Anderson-Darling",
        "Pearson Chi-Square"
    ],
    "Statistic": [
        jb_stat,
        shapiro_stat,
        ad_stat,
        chi_stat
    ],
    "p-value / critical value": [
        jb_p,
        shapiro_p,
        f"5% critical value = {ad_critical_5:.4f}",
        chi_p
    ],
    "Decision at alpha=0.05": [
        "Reject H0" if jb_p < alpha else "Fail to reject H0",
        "Reject H0" if shapiro_p < alpha else "Fail to reject H0",
        ad_decision,
        "Reject H0" if chi_p < alpha else "Fail to reject H0"
    ]
})

results


# -----------------------------
# Print detailed output
# -----------------------------

print("Jarque-Bera test")
print("Statistic:", jb_stat)
print("p-value:", jb_p)
print()

print("Shapiro-Wilk test")
print("Statistic:", shapiro_stat)
print("p-value:", shapiro_p)
print()

print("Anderson-Darling test")
print("Statistic:", ad_stat)
print("Critical values:", ad_result.critical_values)
print("Significance levels:", ad_result.significance_level)
print("5% critical value:", ad_critical_5)
print()

print("Pearson Chi-Square test")
print("Statistic:", chi_stat)
print("Degrees of freedom:", df)
print("p-value:", chi_p)

Jarque-Bera test
Statistic: 2.4563732018799604
p-value: 0.2928231016469254

Shapiro-Wilk test
Statistic: 0.9986082911491394
p-value: 0.6266290545463562

Anderson-Darling test
Statistic: 0.3474697767348971
Critical values: [0.574 0.653 0.784 0.914 1.088]
Significance levels: [15.  10.   5.   2.5  1. ]
5% critical value: 0.784

Pearson Chi-Square test
Statistic: 7.700000000000001
Degrees of freedom: 7
p-value: 0.3597893327261251


# -----------------------------
# Plot histogram
# -----------------------------

plt.figure(figsize=(8, 5))

plt.hist(x, bins=30, density=True, alpha=0.6)

x_grid = np.linspace(min(x), max(x), 300)
normal_density = norm.pdf(x_grid, loc=mu_hat, scale=sigma_hat)

plt.plot(x_grid, normal_density, linewidth=2, label="Fitted normal density")

plt.title("Histogram of Normal Data")
plt.xlabel("x")
plt.ylabel("Density")
plt.legend()
plt.grid(True)
plt.show()


# -----------------------------
# QQ-plot
# -----------------------------

stats.probplot(x, dist="norm", plot=plt)

plt.title("QQ-Plot of Normal Data")
plt.grid(True)
plt.show()


# -----------------------------
# Pearson Chi-Square observed vs expected counts
# -----------------------------

pearson_table = pd.DataFrame({
    "Bin": np.arange(1, k + 1),
    "Observed": observed,
    "Expected": expected
})

pearson_table


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import jarque_bera, shapiro, anderson, chisquare, norm
import scipy.stats as stats

# -----------------------------
# Generate exponential data
# -----------------------------

np.random.seed(42)

x = np.random.exponential(scale=1, size=1000)

alpha = 0.05

# -----------------------------
# 1. Jarque-Bera test
# -----------------------------

jb_stat, jb_p = jarque_bera(x)

# -----------------------------
# 2. Shapiro-Wilk test
# -----------------------------

shapiro_stat, shapiro_p = shapiro(x)

# -----------------------------
# 3. Anderson-Darling test
# -----------------------------

ad_result = anderson(x, dist="norm")

ad_stat = ad_result.statistic

# scipy returns critical values instead of p-values

ad_critical_5 = ad_result.critical_values[
    list(ad_result.significance_level).index(5.0)
]

ad_decision = (
    "Reject H0"
    if ad_stat > ad_critical_5
    else "Fail to reject H0"
)

# -----------------------------
# 4. Pearson Chi-Square test
# -----------------------------

# Estimate normal parameters
mu_hat = np.mean(x)
sigma_hat = np.std(x, ddof=1)

# Number of bins
k = 10

# Equal-probability bins under fitted normal
probabilities = np.linspace(0, 1, k + 1)

bin_edges = norm.ppf(probabilities, loc=mu_hat, scale=sigma_hat)

# Replace infinities
bin_edges[0] = -np.inf
bin_edges[-1] = np.inf

# Observed frequencies
observed, _ = np.histogram(x, bins=bin_edges)

# Expected frequencies
expected = np.ones(k) * len(x) / k

# Pearson chi-square statistic
chi_stat = np.sum((observed - expected) ** 2 / expected)

# Degrees of freedom:
# k bins - 1 constraint - 2 estimated parameters
df = k - 1 - 2

chi_p = 1 - stats.chi2.cdf(chi_stat, df=df)

# -----------------------------
# Create comparison table
# -----------------------------

results = pd.DataFrame({
    "Test": [
        "Jarque-Bera",
        "Shapiro-Wilk",
        "Anderson-Darling",
        "Pearson Chi-Square"
    ],
    "Statistic": [
        jb_stat,
        shapiro_stat,
        ad_stat,
        chi_stat
    ],
    "p-value / critical value": [
        jb_p,
        shapiro_p,
        f"5% critical value = {ad_critical_5:.4f}",
        chi_p
    ],
    "Decision at alpha=0.05": [
        "Reject H0" if jb_p < alpha else "Fail to reject H0",
        "Reject H0" if shapiro_p < alpha else "Fail to reject H0",
        ad_decision,
        "Reject H0" if chi_p < alpha else "Fail to reject H0"
    ]
})

results


# -----------------------------
# Print detailed output
# -----------------------------

print("Jarque-Bera test")
print("Statistic:", jb_stat)
print("p-value:", jb_p)
print()

print("Shapiro-Wilk test")
print("Statistic:", shapiro_stat)
print("p-value:", shapiro_p)
print()

print("Anderson-Darling test")
print("Statistic:", ad_stat)
print("Critical values:", ad_result.critical_values)
print("Significance levels:", ad_result.significance_level)
print("5% critical value:", ad_critical_5)
print()

print("Pearson Chi-Square test")
print("Statistic:", chi_stat)
print("Degrees of freedom:", df)
print("p-value:", chi_p)

Jarque-Bera test
Statistic: 1714.1306134960598
p-value: 0.0

Shapiro-Wilk test
Statistic: 0.8246256113052368
p-value: 1.0949680507146874e-31

Anderson-Darling test
Statistic: 46.51960676156841
Critical values: [0.574 0.653 0.784 0.914 1.088]
Significance levels: [15.  10.   5.   2.5  1. ]
5% critical value: 0.784

Pearson Chi-Square test
Statistic: 376.68
Degrees of freedom: 7
p-value: 0.0


# -----------------------------
# Plot histogram
# -----------------------------

plt.figure(figsize=(8, 5))

plt.hist(x, bins=30, density=True, alpha=0.6)

x_grid = np.linspace(min(x), max(x), 300)

normal_density = norm.pdf(
    x_grid,
    loc=mu_hat,
    scale=sigma_hat
)

plt.plot(
    x_grid,
    normal_density,
    linewidth=2,
    label="Fitted normal density"
)

plt.title("Histogram of Exponential Data")
plt.xlabel("x")
plt.ylabel("Density")

plt.legend()
plt.grid(True)
plt.show()


# -----------------------------
# QQ-plot
# -----------------------------

stats.probplot(x, dist="norm", plot=plt)

plt.title("QQ-Plot of Exponential Data")

plt.grid(True)
plt.show()


# -----------------------------
# Skewness and kurtosis
# -----------------------------

skewness = stats.skew(x)
excess_kurtosis = stats.kurtosis(x)

print("Sample skewness:", skewness)
print("Sample excess kurtosis:", excess_kurtosis)

Sample skewness: 1.865034222864843
Sample excess kurtosis: 5.2178275282033955


# =========================================
# CELL 1 — Import libraries
# =========================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import (
    jarque_bera,
    shapiro,
    anderson,
    norm,
    skew,
    kurtosis
)

import scipy.stats as stats


# =========================================
# CELL 2 — Load dataset
# =========================================

df = pd.read_csv("ds_salaries.csv")

df.head()


# =========================================
# CELL 3 — Inspect dataset
# =========================================

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 607 entries, 0 to 606
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          607 non-null    int64 
 1   work_year           607 non-null    int64 
 2   experience_level    607 non-null    object
 3   employment_type     607 non-null    object
 4   job_title           607 non-null    object
 5   salary              607 non-null    int64 
 6   salary_currency     607 non-null    object
 7   salary_in_usd       607 non-null    int64 
 8   employee_residence  607 non-null    object
 9   remote_ratio        607 non-null    int64 
 10  company_location    607 non-null    object
 11  company_size        607 non-null    object
dtypes: int64(5), object(7)
memory usage: 57.0+ KB


# =========================================
# CELL 4 — Column names
# =========================================

df.columns

Index(['Unnamed: 0', 'work_year', 'experience_level', 'employment_type',
       'job_title', 'salary', 'salary_currency', 'salary_in_usd',
       'employee_residence', 'remote_ratio', 'company_location',
       'company_size'],
      dtype='object')


# =========================================
# CELL 5 — Extract salary column
# =========================================

salary = df["salary_in_usd"]

salary.head()

0     79833
1    260000
2    109024
3     20000
4    150000
Name: salary_in_usd, dtype: int64


# =========================================
# CELL 6 — Clean the data
# =========================================

salary = salary.dropna()

salary = salary[salary > 0]

salary = salary.values

print("Sample size:", len(salary))

Sample size: 607


# =========================================
# CELL 7 — Basic statistics
# =========================================

print("Mean:", np.mean(salary))
print("Median:", np.median(salary))
print("Standard deviation:", np.std(salary, ddof=1))

Mean: 112297.86985172982
Median: 101570.0
Standard deviation: 70957.25941139569


# =========================================
# CELL 8 — Histogram of salaries
# =========================================

plt.figure(figsize=(10,6))

plt.hist(salary, bins=40, density=True)

plt.title("Histogram of Salaries")
plt.xlabel("Salary (USD)")
plt.ylabel("Density")

plt.grid(True)

plt.show()


# =========================================
# CELL 9 — QQ-plot
# =========================================

stats.probplot(salary, dist="norm", plot=plt)

plt.title("QQ-Plot of Salaries")

plt.grid(True)

plt.show()


# =========================================
# CELL 10 — Skewness and kurtosis
# =========================================

salary_skewness = skew(salary)

salary_kurtosis = kurtosis(salary)

print("Skewness:", salary_skewness)

print("Excess kurtosis:", salary_kurtosis)

Skewness: 1.6634213360977623
Excess kurtosis: 6.291709208027671


# =========================================
# CELL 11 — Jarque-Bera test
# =========================================

jb_stat, jb_p = jarque_bera(salary)

print("Jarque-Bera statistic:", jb_stat)

print("p-value:", jb_p)

Jarque-Bera statistic: 1281.1111067841746
p-value: 6.460376212006185e-279


# =========================================
# CELL 12 — Shapiro-Wilk test
# =========================================

shapiro_stat, shapiro_p = shapiro(salary)

print("Shapiro-Wilk statistic:", shapiro_stat)

print("p-value:", shapiro_p)

Shapiro-Wilk statistic: 0.8983551263809204
p-value: 1.254381529574248e-19


# =========================================
# CELL 13 — Anderson-Darling test
# =========================================

ad_result = anderson(salary, dist="norm")

print("Anderson-Darling statistic:")

print(ad_result.statistic)

print("\nCritical values:")

print(ad_result.critical_values)

print("\nSignificance levels:")

print(ad_result.significance_level)

Anderson-Darling statistic:
7.167722523512339

Critical values:
[0.572 0.652 0.782 0.912 1.085]

Significance levels:
[15.  10.   5.   2.5  1. ]


# =========================================
# CELL 14 — Pearson Chi-Square test
# =========================================

mu_hat = np.mean(salary)

sigma_hat = np.std(salary, ddof=1)

k = 10

probabilities = np.linspace(0, 1, k + 1)

bin_edges = norm.ppf(
    probabilities,
    loc=mu_hat,
    scale=sigma_hat
)

bin_edges[0] = -np.inf
bin_edges[-1] = np.inf

observed, _ = np.histogram(
    salary,
    bins=bin_edges
)

expected = np.ones(k) * len(salary) / k

chi_stat = np.sum(
    (observed - expected)**2 / expected
)

df_chi = k - 1 - 2

chi_p = 1 - stats.chi2.cdf(
    chi_stat,
    df=df_chi
)

print("Chi-square statistic:", chi_stat)

print("Degrees of freedom:", df_chi)

print("p-value:", chi_p)

Chi-square statistic: 36.80560131795716
Degrees of freedom: 7
p-value: 5.10556341737356e-06


# =========================================
# CELL 15 — Pearson observed vs expected
# =========================================

pearson_table = pd.DataFrame({
    "Bin": np.arange(1, k + 1),
    "Observed": observed,
    "Expected": expected
})

pearson_table


# =========================================
# CELL 16 — Summary table
# =========================================

alpha = 0.05

ad_critical_5 = ad_result.critical_values[
    list(ad_result.significance_level).index(5.0)
]

ad_decision = (
    "Reject H0"
    if ad_result.statistic > ad_critical_5
    else "Fail to reject H0"
)

results = pd.DataFrame({
    "Test": [
        "Jarque-Bera",
        "Shapiro-Wilk",
        "Anderson-Darling",
        "Pearson Chi-Square"
    ],
    "Statistic": [
        jb_stat,
        shapiro_stat,
        ad_result.statistic,
        chi_stat
    ],
    "p-value / critical value": [
        jb_p,
        shapiro_p,
        f"5% critical value = {ad_critical_5:.4f}",
        chi_p
    ],
    "Decision at alpha=0.05": [
        "Reject H0" if jb_p < alpha else "Fail to reject H0",
        "Reject H0" if shapiro_p < alpha else "Fail to reject H0",
        ad_decision,
        "Reject H0" if chi_p < alpha else "Fail to reject H0"
    ]
})

results


# ==========================================================
# NORMALITY ANALYSIS OF NHANES HEIGHT DATA
# ==========================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import (
    jarque_bera,
    shapiro,
    anderson,
    skew,
    kurtosis,
    norm
)

import scipy.stats as stats

# ----------------------------------------------------------
# 1. Load dataset
# ----------------------------------------------------------

df = pd.read_csv("NHANES Weight and Height.csv")

print("Columns:")
print(df.columns)

# ----------------------------------------------------------
# 2. Select height column
# ----------------------------------------------------------

# Adjust column name if necessary
# Common names: "Height", "Height_cm", etc.

height_col = [col for col in df.columns if "height" in col.lower()][0]

print("\nUsing column:", height_col)

x = df[height_col]

# ----------------------------------------------------------
# 3. Clean data
# ----------------------------------------------------------

x = x.dropna()

x = x[x > 0]

x = x.values

print("\nSample size:", len(x))

# ----------------------------------------------------------
# 4. Basic statistics
# ----------------------------------------------------------

print("\nBasic statistics")
print("----------------------")
print("Mean:", np.mean(x))
print("Median:", np.median(x))
print("Standard deviation:", np.std(x, ddof=1))

# ----------------------------------------------------------
# 5. Histogram
# ----------------------------------------------------------

plt.figure(figsize=(10,6))

plt.hist(x, bins=30, density=True, alpha=0.6)

mu_hat = np.mean(x)
sigma_hat = np.std(x, ddof=1)

x_grid = np.linspace(min(x), max(x), 500)

normal_density = norm.pdf(
    x_grid,
    loc=mu_hat,
    scale=sigma_hat
)

plt.plot(
    x_grid,
    normal_density,
    linewidth=2,
    label="Fitted normal density"
)

plt.title("Histogram of Heights")
plt.xlabel("Height")
plt.ylabel("Density")

plt.legend()
plt.grid(True)

plt.show()

# ----------------------------------------------------------
# 6. QQ-plot
# ----------------------------------------------------------

stats.probplot(x, dist="norm", plot=plt)

plt.title("QQ-Plot of Heights")

plt.grid(True)

plt.show()

# ----------------------------------------------------------
# 7. Skewness and kurtosis
# ----------------------------------------------------------

sample_skewness = skew(x)

sample_kurtosis = kurtosis(x)

print("\nSkewness and Kurtosis")
print("----------------------")
print("Skewness:", sample_skewness)
print("Excess kurtosis:", sample_kurtosis)

# ----------------------------------------------------------
# 8. Jarque-Bera test
# ----------------------------------------------------------

jb_stat, jb_p = jarque_bera(x)

# ----------------------------------------------------------
# 9. Shapiro-Wilk test
# ----------------------------------------------------------

# Shapiro can fail for very large datasets
# Use a random subsample if necessary

if len(x) > 5000:
    x_shapiro = np.random.choice(x, 5000, replace=False)
else:
    x_shapiro = x

shapiro_stat, shapiro_p = shapiro(x_shapiro)

# ----------------------------------------------------------
# 10. Anderson-Darling test
# ----------------------------------------------------------

ad_result = anderson(x, dist="norm")

ad_stat = ad_result.statistic

ad_critical_5 = ad_result.critical_values[
    list(ad_result.significance_level).index(5.0)
]

ad_decision = (
    "Reject H0"
    if ad_stat > ad_critical_5
    else "Fail to reject H0"
)

# ----------------------------------------------------------
# 11. Pearson Chi-Square test
# ----------------------------------------------------------

k = 10

probabilities = np.linspace(0, 1, k + 1)

bin_edges = norm.ppf(
    probabilities,
    loc=mu_hat,
    scale=sigma_hat
)

bin_edges[0] = -np.inf
bin_edges[-1] = np.inf

observed, _ = np.histogram(
    x,
    bins=bin_edges
)

expected = np.ones(k) * len(x) / k

chi_stat = np.sum(
    (observed - expected)**2 / expected
)

df_chi = k - 1 - 2

chi_p = 1 - stats.chi2.cdf(
    chi_stat,
    df=df_chi
)

# ----------------------------------------------------------
# 12. Results table
# ----------------------------------------------------------

alpha = 0.05

results = pd.DataFrame({
    "Test": [
        "Jarque-Bera",
        "Shapiro-Wilk",
        "Anderson-Darling",
        "Pearson Chi-Square"
    ],
    "Statistic": [
        jb_stat,
        shapiro_stat,
        ad_stat,
        chi_stat
    ],
    "p-value / critical value": [
        jb_p,
        shapiro_p,
        f"5% critical value = {ad_critical_5:.4f}",
        chi_p
    ],
    "Decision at alpha=0.05": [
        "Reject H0" if jb_p < alpha else "Fail to reject H0",
        "Reject H0" if shapiro_p < alpha else "Fail to reject H0",
        ad_decision,
        "Reject H0" if chi_p < alpha else "Fail to reject H0"
    ]
})

print("\nNormality Test Results")
print("----------------------")

display(results)

# ----------------------------------------------------------
# 13. Pearson observed vs expected table
# ----------------------------------------------------------

pearson_table = pd.DataFrame({
    "Bin": np.arange(1, k + 1),
    "Observed": observed,
    "Expected": expected
})

print("\nObserved vs Expected Frequencies")
print("--------------------------------")

display(pearson_table)

Columns:
Index(['Unnamed: 0', 'Weight (kg)', 'Standing Height (cm)', 'BMI(kg/m**2)'], dtype='object')

Using column: Standing Height (cm)

Sample size: 8388

Basic statistics
----------------------
Mean: 166.64118979494518
Median: 166.2
Standard deviation: 10.079013460239327

Skewness and Kurtosis
----------------------
Skewness: 0.14247082433871722
Excess kurtosis: -0.41736992735109846

Normality Test Results
----------------------

Observed vs Expected Frequencies
--------------------------------

Distribution	Excess Kurtosis
Normal	0
Heavy-tailed	Positive
Light-tailed	Negative

Method	Type
Histogram	Visual
QQ-plot	Visual
Shapiro–Wilk	Formal test
Anderson–Darling	Formal test
Kolmogorov–Smirnov	Formal test
Pearson Chi-Square	Formal test

Method	Main Idea	Strengths	Weaknesses
Histogram	Visual shape	Simple	Subjective
QQ-plot	Compare quantiles	Very informative	Subjective
Pearson Chi-Square	Compare frequencies	Classical	Depends on bins
KS/Lilliefors	Compare CDFs	Simple	Weak power
Anderson–Darling	Tail-sensitive EDF test	Strong tails	Slightly more complex
Shapiro–Wilk	Order statistics	Best general power	Sensitive for huge n

Situation	Recommended Method
Small/moderate sample	Shapiro–Wilk
Tail behavior important	Anderson–Darling
Visual diagnosis	QQ-plot
Classical teaching	Pearson Chi-Square
Distribution-free EDF idea	Lilliefors

Method	Type
Histogram	Visual
QQ-plot	Visual
Shapiro–Wilk	Formal test
Anderson–Darling	Formal test
Lilliefors	Formal test
Pearson Chi-Square	Formal test

	Test	Statistic	p-value / critical value	Decision at alpha=0.05
0	Jarque-Bera	2.456373	0.292823	Fail to reject H0
1	Shapiro-Wilk	0.998608	0.626629	Fail to reject H0
2	Anderson-Darling	0.347470	5% critical value = 0.7840	Fail to reject H0
3	Pearson Chi-Square	7.700000	0.359789	Fail to reject H0

	Bin	Observed	Expected
0	1	103	100.0
1	2	97	100.0
2	3	113	100.0
3	4	94	100.0
4	5	91	100.0
5	6	111	100.0
6	7	98	100.0
7	8	106	100.0
8	9	83	100.0
9	10	104	100.0

	Test	Statistic	p-value / critical value	Decision at alpha=0.05
0	Jarque-Bera	1714.130613	0.0	Reject H0
1	Shapiro-Wilk	0.824626	0.0	Reject H0
2	Anderson-Darling	46.519607	5% critical value = 0.7840	Reject H0
3	Pearson Chi-Square	376.680000	0.0	Reject H0

	Unnamed: 0	work_year	experience_level	employment_type	job_title	salary	salary_currency	salary_in_usd	employee_residence	remote_ratio	company_location	company_size
0	0	2020	MI	FT	Data Scientist	70000	EUR	79833	DE	0	DE	L
1	1	2020	SE	FT	Machine Learning Scientist	260000	USD	260000	JP	0	JP	S
2	2	2020	SE	FT	Big Data Engineer	85000	GBP	109024	GB	50	GB	M
3	3	2020	MI	FT	Product Data Analyst	20000	USD	20000	HN	0	HN	S
4	4	2020	SE	FT	Machine Learning Engineer	150000	USD	150000	US	50	US	L

	Bin	Observed	Expected
0	1	36	60.7
1	2	80	60.7
2	3	79	60.7
3	4	76	60.7
4	5	66	60.7
5	6	65	60.7
6	7	46	60.7
7	8	64	60.7
8	9	42	60.7
9	10	53	60.7

	Test	Statistic	p-value / critical value	Decision at alpha=0.05
0	Jarque-Bera	1281.111107	0.0	Reject H0
1	Shapiro-Wilk	0.898355	0.0	Reject H0
2	Anderson-Darling	7.167723	5% critical value = 0.7820	Reject H0
3	Pearson Chi-Square	36.805601	0.000005	Reject H0

	Test	Statistic	p-value / critical value	Decision at alpha=0.05
0	Jarque-Bera	89.258595	0.0	Reject H0
1	Shapiro-Wilk	0.994761	0.0	Reject H0
2	Anderson-Darling	10.993311	5% critical value = 0.7870	Reject H0
3	Pearson Chi-Square	78.161183	0.0	Reject H0

	Bin	Observed	Expected
0	1	820	838.8
1	2	1010	838.8
2	3	888	838.8
3	4	816	838.8
4	5	780	838.8
5	6	736	838.8
6	7	756	838.8
7	8	780	838.8
8	9	880	838.8
9	10	922	838.8

Seminar 14

Testing Normality of a Sample¶

Skewness and Kurtosis¶

1. Introduction¶

skewness

3.1 Definition¶

3.2 Why Divide by $\sigma^3$?¶

3.3 Interpretation¶

Symmetric distribution¶

Positive skewness¶

Negative skewness¶

3.4 Why the Third Power Detects Asymmetry¶

3.6 Sample Skewness¶

Kurtosis

4.1 Definition¶

4.2 Excess Kurtosis¶

4.3 Interpretation¶

4.4 Why the Fourth Power Matters¶

4.7 Sample Kurtosis¶

5. Relationship with Normality¶

Testing Normality of a Sample

1. Introduction¶

2. Important Philosophy¶

3. Histogram¶

4. QQ-Plot (Quantile-Quantile Plot)¶

4.1 Main Idea¶

4.2 Ordered Sample¶

4.3 Theoretical Normal Quantiles¶

4.4 Constructing the QQ-Plot¶

4.5 Interpretation¶

Approximately linear¶

S-shaped curve¶

Curved tails¶

Extreme deviations at ends¶

4.6 Why QQ-Plots are Important¶

5. Pearson Chi-Square Test for Normality¶

5.1 Main Idea¶

5.2 Step 1: Create Intervals¶

5.3 Step 2: Observed Frequencies¶

5.4 Step 3: Estimate Parameters¶

5.5 Step 4: Expected Frequencies¶

5.6 Test Statistic¶

5.7 Degrees of Freedom¶

5.8 Limitations¶

8. Anderson–Darling Test¶

8.1 Main Idea¶

8.2 Test Statistic¶

8.3 Why It Is Powerful¶

8.4 Advantages¶

9. Shapiro–Wilk Test¶

9.1 Main Idea¶

9.2 Ordered Sample¶

9.3 Test Statistic¶

9.4 Interpretation¶

9.5 Why It Works Well¶

9.6 Advantages¶

9.7 Limitations¶

10. Comparison of Methods¶

11. Which Test Should Be Used?¶

12. Important Practical Advice¶

13. Normality in Regression¶

14. Summary¶

Jarque–Bera Test

6.2 Test Statistic¶

Example 2 — Test for Non-Normal Data¶

Salary Data¶