# Statistical plots#

## Histogram#

Use `quad()` glyphs to create a histogram plotted from `np.histogram` output

```import numpy as np

from bokeh.plotting import figure, show

rng = np.random.default_rng()
x = rng.normal(loc=0, scale=1, size=1000)

p = figure(width=670, height=400, toolbar_location=None,
title="Normal (Gaussian) Distribution")

# Histogram
bins = np.linspace(-3, 3, 40)
hist, edges = np.histogram(x, density=True, bins=bins)
fill_color="skyblue", line_color="white",
legend_label="1000 random samples")

# Probability density function
x = np.linspace(-3.0, 3.0, 100)
pdf = np.exp(-0.5*x**2) / np.sqrt(2.0*np.pi)
p.line(x, pdf, line_width=2, line_color="navy",
legend_label="Probability Density Function")

p.y_range.start = 0
p.xaxis.axis_label = "x"
p.yaxis.axis_label = "PDF(x)"

show(p)
```

A population pyramid plot is a divergent horizontal bar plot that can be used to compare distributions between two groups. In Bokeh they can be created using `hbar()` glyphs.

```import numpy as np

from bokeh.models import CustomJSTickFormatter, Label
from bokeh.palettes import DarkText, Vibrant3 as colors
from bokeh.plotting import figure, show
from bokeh.sampledata.titanic import data as df

sex_group = df.groupby("sex")

female_ages = sex_group.get_group("female")["age"].dropna()
male_ages = sex_group.get_group("male")["age"].dropna()

bin_width = 5
bins = np.arange(0, 72, bin_width)
m_hist, edges = np.histogram(male_ages, bins=bins)
f_hist, edges = np.histogram(female_ages, bins=bins)

p = figure(title="Age population pyramid of titanic passengers, by gender", height=400, width=600,
x_range=(-90, 90), x_axis_label="count")

p.hbar(right=f_hist, y=edges[1:], height=bin_width*0.8, color=colors[0], line_width=0)

p.hbar(right=m_hist * -1, y=edges[1:], height=bin_width*0.8, color=colors[1], line_width=0)

# add text to every other bar
for i, (count, age) in enumerate(zip(f_hist, edges[1:])):
if i % 2 == 1:
continue
p.text(x=count, y=edges[1:][i], text=[f"{age-bin_width}-{age}yrs"],
x_offset=5, y_offset=7, text_font_size="12px", text_color=DarkText[5])

# customise x-axis and y-axis
p.xaxis.ticker = (-80, -60, -40, -20, 0, 20, 40, 60, 80)
p.xaxis.major_tick_out = 0
p.y_range.start = 3
p.ygrid.grid_line_color = None
p.yaxis.visible = False

# format tick labels as absolute values for the two-sided plot
p.xaxis.formatter = CustomJSTickFormatter(code="return Math.abs(tick);")

show(p)
```

## Boxplot#

Box plots can be assembled using `Whisker` annotations, `vbar()` and `scatter()` glyphs:

```import pandas as pd

from bokeh.models import ColumnDataSource, Whisker
from bokeh.plotting import figure, show
from bokeh.sampledata.autompg2 import autompg2
from bokeh.transform import factor_cmap

df = autompg2[["class", "hwy"]].rename(columns={"class": "kind"})

kinds = df.kind.unique()

# compute quantiles
qs = df.groupby("kind").hwy.quantile([0.25, 0.5, 0.75])
qs = qs.unstack().reset_index()
qs.columns = ["kind", "q1", "q2", "q3"]

# compute IQR outlier bounds
iqr = qs.q3 - qs.q1
qs["upper"] = qs.q3 + 1.5*iqr
qs["lower"] = qs.q1 - 1.5*iqr
df = pd.merge(df, qs, on="kind", how="left")

source = ColumnDataSource(qs)

p = figure(x_range=kinds, tools="", toolbar_location=None,
title="Highway MPG distribution by vehicle class",
background_fill_color="#eaefef", y_axis_label="MPG")

# outlier range
whisker = Whisker(base="kind", upper="upper", lower="lower", source=source)

# quantile boxes
cmap = factor_cmap("kind", "TolRainbow7", kinds)
p.vbar("kind", 0.7, "q2", "q3", source=source, color=cmap, line_color="black")
p.vbar("kind", 0.7, "q1", "q2", source=source, color=cmap, line_color="black")

# outliers
outliers = df[~df.hwy.between(df.lower, df.upper)]
p.scatter("kind", "hwy", source=outliers, size=6, color="black", alpha=0.3)

p.xgrid.grid_line_color = None
p.axis.major_label_text_font_size="14px"
p.axis.axis_label_text_font_size="12px"

show(p)
```

## Kernel density estimation#

```import numpy as np
from scipy.stats import gaussian_kde

from bokeh.palettes import Blues9
from bokeh.plotting import figure, show
from bokeh.sampledata.autompg import autompg as df

def kde(x, y, N):
xmin, xmax = x.min(), x.max()
ymin, ymax = y.min(), y.max()

X, Y = np.mgrid[xmin:xmax:N*1j, ymin:ymax:N*1j]
positions = np.vstack([X.ravel(), Y.ravel()])
values = np.vstack([x, y])
kernel = gaussian_kde(values)
Z = np.reshape(kernel(positions).T, X.shape)

return X, Y, Z

x, y, z = kde(df.hp, df.mpg, 300)

p = figure(height=400, x_axis_label="hp", y_axis_label="mpg",
background_fill_color="#fafafa", tools="", toolbar_location=None,
title="Kernel density estimation plot of HP vs MPG")
p.grid.level = "overlay"
p.grid.grid_line_color = "black"
p.grid.grid_line_alpha = 0.05

palette = Blues9[::-1]
levels = np.linspace(np.min(z), np.max(z), 10)
p.contour(x, y, z, levels[1:], fill_color=palette, line_color=palette)

show(p)
```

Kernel density estimates can also be plotted using `varea()` glyphs:

```import numpy as np
from sklearn.neighbors import KernelDensity

from bokeh.models import ColumnDataSource, Label, PrintfTickFormatter
from bokeh.palettes import Dark2_5 as colors
from bokeh.plotting import figure, show
from bokeh.sampledata.cows import data as df

breed_groups = df.groupby('breed')
x = np.linspace(2, 8, 1000)
source = ColumnDataSource(dict(x=x))

p = figure(title="Multiple density estimates", height=300, x_range=(2.5, 7.5), x_axis_label="butterfat contents", y_axis_label="density")

for (breed, breed_df), color in zip(breed_groups, colors):
data = breed_df['butterfat'].values
kde = KernelDensity(kernel="gaussian", bandwidth=0.2).fit(data[:, np.newaxis])
log_density = kde.score_samples(x[:, np.newaxis])
y = np.exp(log_density)
p.varea(x="x", y1=breed, y2=0, source=source, fill_alpha=0.3, fill_color=color)

# Find the highest point and annotate with a label
max_idx = np.argmax(y)
highest_point_label = Label(
x=x[max_idx],
y=y[max_idx],
text=breed,
text_font_size="10pt",
x_offset=10,
y_offset=-5,
text_color=color,
)

# Display x-axis labels as percentages
p.xaxis.formatter = PrintfTickFormatter(format="%d%%")

p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.minor_tick_line_color = None

p.xgrid.grid_line_color = None
p.yaxis.ticker = (0, 0.5, 1, 1.5)
p.y_range.start = 0

show(p)
```

## SinaPlot#

SinaPlots can be assembled using the `harea()` and `scatter()` glyphs:

```import numpy as np
import pandas as pd
from sklearn.neighbors import KernelDensity

from bokeh.plotting import figure, show
from bokeh.sampledata.lincoln import data as df

df["DATE"] = pd.to_datetime(df["DATE"])
df["TAVG"] = (df["TMAX"] + df["TMIN"]) / 2
df["MONTH"] = df.DATE.dt.strftime("%b")

months = list(df.MONTH.unique())

p = figure(
height=400,
width=600,
x_range=months,
x_axis_label="month",
y_axis_label="mean temperature (F)",
)

# add a non-uniform categorical offset to a given category
def offset(category, data, scale=7):
return list(zip([category] * len(data), scale * data))

for month in months:
month_df = df[df.MONTH == month].dropna()
tavg = month_df.TAVG.values
temps = np.linspace(tavg.min(), tavg.max(), 50)

kde = KernelDensity(kernel="gaussian", bandwidth=3).fit(tavg[:, np.newaxis])
density = np.exp(kde.score_samples(temps[:, np.newaxis]))
x1, x2 = offset(month, density), offset(month, -density)

p.harea(x1=x1, x2=x2, y=temps, alpha=0.8, color="#E0E0E0")

# pre-compute jitter in Python, this case is too complex for BokehJS
tavg_density = np.exp(kde.score_samples(tavg[:, np.newaxis]))
jitter = (np.random.random(len(tavg)) * 2 - 1) * tavg_density

p.scatter(x=offset(month, jitter), y=tavg, color="black")

p.y_range.start = -10
p.yaxis.ticker = [0, 25, 50, 75]
p.grid.grid_line_color = None

show(p)
```

## SPLOM#

A SPLOM is “scatter plot matrix” that arranges multiple scatter plots in a grid fashion in order to highlight correlations between dimensions. Key components of a SPLOM are Linked panning and Linked brushing as demonstrated in this example:

```from itertools import product

from bokeh.io import show
from bokeh.layouts import gridplot
from bokeh.models import (BasicTicker, ColumnDataSource, DataRange1d,
Grid, LassoSelectTool, LinearAxis, PanTool,
Plot, ResetTool, Scatter, WheelZoomTool)
from bokeh.sampledata.penguins import data
from bokeh.transform import factor_cmap

df = data.copy()
df["body_mass_kg"] = df["body_mass_g"] / 1000

SPECIES = sorted(df.species.unique())
ATTRS = ("bill_length_mm", "bill_depth_mm", "body_mass_kg")
N = len(ATTRS)

source = ColumnDataSource(data=df)

xdrs = [DataRange1d(bounds=None) for _ in range(N)]
ydrs = [DataRange1d(bounds=None) for _ in range(N)]

plots = []

for i, (y, x) in enumerate(product(ATTRS, reversed(ATTRS))):
p = Plot(x_range=xdrs[i%N], y_range=ydrs[i//N],
background_fill_color="#fafafa",
border_fill_color="white", width=200, height=200, min_border=5)

if i % N == 0:  # first column
p.min_border_left = p.min_border + 4
p.width += 40
yaxis = LinearAxis(axis_label=y)
yaxis.major_label_orientation = "vertical"
yticker = yaxis.ticker
else:
yticker = BasicTicker()

if i >= N*(N-1):  # last row
p.min_border_bottom = p.min_border + 40
p.height += 40
xaxis = LinearAxis(axis_label=x)
xticker = xaxis.ticker
else:
xticker = BasicTicker()

scatter = Scatter(x=x, y=y, fill_alpha=0.6, size=5, line_color=None,
fill_color=factor_cmap('species', 'Category10_3', SPECIES))
p.x_range.renderers.append(r)
p.y_range.renderers.append(r)

# suppress the diagonal
if (i%N) + (i//N) == N-1:
r.visible = False
p.grid.grid_line_color = None