sinaplot

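A sina plot of the “lincoln” data: a per-month kernel density estimate computed with the sklearn.neighbors.KernelDensity function and drawn with the Bokeh harea glyph, overlaid with a scatter of the individual observations jittered in proportion to the local density.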

Details

Sampledata:

bokeh.sampledata.lincoln

Bokeh APIs:

figure.harea

More info:

SinaPlot

Keywords:

jitter, scatter, sinaplot

import numpy as np
import pandas as pd
from sklearn.neighbors import KernelDensity

from bokeh.plotting import figure, show
from bokeh.sampledata.lincoln import data as df

# Derive the columns the plot needs: a parsed date, the midpoint of the
# TMAX/TMIN readings, and the abbreviated month name of each row.
df["DATE"] = pd.to_datetime(df["DATE"])
df["TAVG"] = df["TMAX"].add(df["TMIN"]).div(2)
df["MONTH"] = df["DATE"].dt.strftime("%b")

# Distinct month labels; they become the factors of the categorical x-axis.
months = list(df["MONTH"].unique())

figure_options = dict(
    width=600,
    height=400,
    x_range=months,
    x_axis_label="month",
    y_axis_label="mean temperature (F)",
)
p = figure(**figure_options)

def offset(category, data, scale=7):
    """Pair each value in *data* with *category* as a scaled categorical offset.

    Adds a non-uniform categorical offset to a given category: every value
    is multiplied by *scale* and paired with the same category label.

    Args:
        category: The categorical factor every point belongs to.
        data: Numeric offsets, one per point. Any array-like is accepted
            (NumPy array, list, tuple, ...).
        scale: Multiplier applied to every offset before pairing.

    Returns:
        A list of ``(category, scale * value)`` tuples, one per element of
        *data*, in the same order.
    """
    # Convert before multiplying: ``scale * data`` on a plain list would
    # repeat the list (int scale) or raise TypeError (float scale) instead
    # of scaling the individual values.
    scaled = scale * np.asarray(data)
    return [(category, value) for value in scaled]


# For every month draw a mirrored density outline and, on top of it, the
# individual observations jittered horizontally within that outline.
for month in months:
    # Observations for this month; rows with any missing value are dropped.
    samples = df[df.MONTH == month].dropna().TAVG.values
    grid = np.linspace(samples.min(), samples.max(), 50)

    # KernelDensity works on 2-D input, so use a single-feature column.
    samples_2d = samples.reshape(-1, 1)
    estimator = KernelDensity(kernel="gaussian", bandwidth=3)
    estimator.fit(samples_2d)

    # score_samples yields log-density, hence the exponentiation.
    outline = np.exp(estimator.score_samples(grid.reshape(-1, 1)))
    p.harea(
        x1=offset(month, outline),
        x2=offset(month, -outline),
        y=grid,
        alpha=0.8,
        color="#E0E0E0",
    )

    # pre-compute jitter in Python, this case is too complex for BokehJS:
    # a uniform draw in [-1, 1) scaled by the density at each observation
    # keeps the points inside the outline drawn above.
    point_density = np.exp(estimator.score_samples(samples_2d))
    spread = (2 * np.random.random(samples.size) - 1) * point_density

    p.scatter(x=offset(month, spread), y=samples, color="black")

# Final cosmetics: no grid lines, fixed y ticks, and a y-axis starting at -10.
p.grid.grid_line_color = None
p.yaxis.ticker = [0, 25, 50, 75]
p.y_range.start = -10

show(p)