Statistical plots#
Histogram#
Use quad() glyphs to create a histogram plotted from np.histogram output:
import numpy as np
from bokeh.plotting import figure, show
# Draw 1000 samples from a normal distribution with mean 0 and std-dev 1.
rng = np.random.default_rng()
samples = rng.normal(loc=0, scale=1, size=1000)

p = figure(
    width=670,
    height=400,
    toolbar_location=None,
    title="Normal (Gaussian) Distribution",
)

# Histogram: 40 edges over [-3, 3]; density=True normalises the bar heights
# so they are comparable with the probability density curve drawn below.
bin_edges = np.linspace(-3, 3, 40)
density, edges = np.histogram(samples, bins=bin_edges, density=True)
p.quad(
    left=edges[:-1],
    right=edges[1:],
    bottom=0,
    top=density,
    fill_color="skyblue",
    line_color="white",
    legend_label="1000 random samples",
)

# Probability density function of the standard normal, sampled at 100 points.
x = np.linspace(-3.0, 3.0, 100)
pdf = np.exp(-0.5*x**2) / np.sqrt(2.0*np.pi)
p.line(
    x,
    pdf,
    line_color="navy",
    line_width=2,
    legend_label="Probability Density Function",
)

# Anchor the y axis at zero and label both axes.
p.y_range.start = 0
p.xaxis.axis_label = "x"
p.yaxis.axis_label = "PDF(x)"

show(p)
Boxplot#
Box plots can be assembled using Whisker annotations together with vbar() and scatter() glyphs:
import pandas as pd
from bokeh.models import ColumnDataSource, Whisker
from bokeh.plotting import figure, show
from bokeh.sampledata.autompg2 import autompg2
from bokeh.transform import factor_cmap
# Keep only the two columns that are plotted. "class" is renamed to "kind"
# so the column can be read with attribute access (df.kind) below.
df = autompg2[["class", "hwy"]].rename(columns={"class": "kind"})

kinds = df.kind.unique()

# compute quantiles: one row per kind with columns q1 / q2 / q3
qs = df.groupby("kind").hwy.quantile([0.25, 0.5, 0.75])
qs = qs.unstack().reset_index()
qs.columns = ["kind", "q1", "q2", "q3"]

# compute IQR outlier bounds on the per-kind table
iqr = qs.q3 - qs.q1
qs["upper"] = qs.q3 + 1.5*iqr
qs["lower"] = qs.q1 - 1.5*iqr

# Attach the per-kind statistics to every observation; the merged frame is
# only needed for the outlier test further down.
df = pd.merge(df, qs, on="kind", how="left")

# Drive the whiskers and boxes from the per-kind table, so each one is drawn
# exactly once per kind rather than once per observation.
source = ColumnDataSource(qs)

p = figure(x_range=kinds, tools="", toolbar_location=None,
           title="Highway MPG distribution by vehicle class",
           background_fill_color="#eaefef", y_axis_label="MPG")

# outlier range
whisker = Whisker(base="kind", upper="upper", lower="lower", source=source)
whisker.upper_head.size = whisker.lower_head.size = 20
p.add_layout(whisker)

# quantile boxes: upper half (q2..q3) and lower half (q1..q2) of each box
cmap = factor_cmap("kind", "TolRainbow7", kinds)
p.vbar("kind", 0.7, "q2", "q3", source=source, color=cmap, line_color="black")
p.vbar("kind", 0.7, "q1", "q2", source=source, color=cmap, line_color="black")

# outliers: observations falling outside [lower, upper] for their kind
outliers = df[~df.hwy.between(df.lower, df.upper)]
p.scatter("kind", "hwy", source=outliers, size=6, color="black", alpha=0.3)

p.xgrid.grid_line_color = None
p.axis.major_label_text_font_size = "14px"
p.axis.axis_label_text_font_size = "12px"

show(p)
Kernel density estimation#
import numpy as np
from scipy.stats import gaussian_kde
from bokeh.palettes import Blues9
from bokeh.plotting import figure, show
from bokeh.sampledata.autompg import autompg as df
def kde(x, y, N):
    """Evaluate a 2-D Gaussian kernel density estimate on a regular grid.

    The grid has ``N`` points along each axis and spans the minimum to the
    maximum of ``x`` and of ``y`` (both ends included).

    Args:
        x: 1-D array-like of sample x coordinates (must provide ``min``/``max``).
        y: 1-D array-like of sample y coordinates, same length as ``x``.
        N: number of grid points along each axis.

    Returns:
        A tuple ``(X, Y, Z)`` of ``(N, N)`` arrays: the grid x coordinates,
        the grid y coordinates and the estimated density at each grid point.
    """
    # A complex "step" tells mgrid to produce N evenly spaced points with
    # the stop value included.
    grid_x, grid_y = np.mgrid[
        x.min():x.max():N * 1j,
        y.min():y.max():N * 1j,
    ]

    # gaussian_kde takes data as (n_dims, n_points), for fitting and evaluation.
    estimator = gaussian_kde(np.vstack((x, y)))
    query_points = np.vstack((grid_x.ravel(), grid_y.ravel()))

    density = estimator(query_points).reshape(grid_x.shape)
    return grid_x, grid_y, density
# Density of horsepower vs. miles-per-gallon on a 300 x 300 grid.
x, y, z = kde(df.hp, df.mpg, 300)

p = figure(
    title="Kernel density estimation plot of HP vs MPG",
    height=400,
    x_axis_label="hp",
    y_axis_label="mpg",
    background_fill_color="#fafafa",
    tools="",
    toolbar_location=None,
)

# Draw faint grid lines on the "overlay" level.
p.grid.level = "overlay"
p.grid.grid_line_color = "black"
p.grid.grid_line_alpha = 0.05

# Reversed Blues9 palette.
palette = Blues9[::-1]

# Ten evenly spaced levels across the density range; the lowest one is
# skipped when contouring (presumably to leave the near-zero region unfilled).
levels = np.linspace(z.min(), z.max(), 10)
p.contour(x, y, z, levels[1:], fill_color=palette, line_color=palette)

show(p)
SPLOM#
A SPLOM is a “scatter plot matrix” that arranges multiple scatter plots in a grid in order to highlight correlations between dimensions. Key components of a SPLOM are linked panning and linked brushing, as demonstrated in this example:
from itertools import product

from bokeh.io import show
from bokeh.layouts import gridplot
from bokeh.models import (BasicTicker, Circle, ColumnDataSource,
                          DataRange1d, Grid, LassoSelectTool, LinearAxis,
                          PanTool, Plot, ResetTool, Scatter, WheelZoomTool)
from bokeh.sampledata.penguins import data
from bokeh.transform import factor_cmap
# Work on a copy and express body mass in kilograms.
df = data.copy()
df["body_mass_kg"] = df["body_mass_g"] / 1000

SPECIES = sorted(df.species.unique())
ATTRS = ("bill_length_mm", "bill_depth_mm", "body_mass_kg")
N = len(ATTRS)

# Every panel draws from this one data source.
source = ColumnDataSource(data=df)

# One x range per grid column and one y range per grid row; panels in the
# same column / row reuse the same range object.
xdrs = [DataRange1d(bounds=None) for _ in range(N)]
ydrs = [DataRange1d(bounds=None) for _ in range(N)]

plots = []

for i, (y, x) in enumerate(product(ATTRS, reversed(ATTRS))):
    # Panels are laid out row-major, N per row.
    row, col = divmod(i, N)

    p = Plot(x_range=xdrs[col], y_range=ydrs[row],
             background_fill_color="#fafafa",
             border_fill_color="white", width=200, height=200, min_border=5)

    if col == 0:  # first column: gets the labelled y axis
        p.min_border_left = p.min_border + 4
        p.width += 40
        yaxis = LinearAxis(axis_label=y)
        yaxis.major_label_orientation = "vertical"
        p.add_layout(yaxis, "left")
        yticker = yaxis.ticker
    else:
        # No axis here, but the grid still needs a ticker.
        yticker = BasicTicker()
    p.add_layout(Grid(dimension=1, ticker=yticker))

    if row == N - 1:  # last row: gets the labelled x axis
        p.min_border_bottom = p.min_border + 40
        p.height += 40
        xaxis = LinearAxis(axis_label=x)
        p.add_layout(xaxis, "below")
        xticker = xaxis.ticker
    else:
        xticker = BasicTicker()
    p.add_layout(Grid(dimension=0, ticker=xticker))

    # Scatter (default "circle" marker) is used for screen-sized markers;
    # Circle with a `size` is deprecated in newer Bokeh releases.
    glyph = Scatter(x=x, y=y, fill_alpha=0.6, size=5, line_color=None,
                    fill_color=factor_cmap('species', 'Category10_3', SPECIES))
    r = p.add_glyph(source, glyph)
    p.x_range.renderers.append(r)
    p.y_range.renderers.append(r)

    # suppress the diagonal: x is iterated in reverse, so the panels where
    # x and y are the same attribute lie on the anti-diagonal
    if row + col == N - 1:
        r.visible = False
        p.grid.grid_line_color = None

    p.add_tools(PanTool(), WheelZoomTool(), ResetTool(), LassoSelectTool())

    plots.append(p)

show(gridplot(plots, ncols=N))