python - Matplotlibの平行座標プロット

Question

2次元および3次元のデータは、従来のプロットタイプを使用して比較的簡単に表示できます。4次元データでも、データを表示する方法を見つけられることがよくあります。ただし、4を超える次元になると、表示はますます困難になります。幸いなことに、平行座標プロットは、より高い次元の結果を表示するための仕組みを提供します。

ウィキペディアの平行座標プロットの例

Matlab、R、VTKタイプ1、VTKタイプ2など、いくつかのプロットパッケージは平行座標プロットを提供しますが、Matplotlibを使用して作成する方法がわかりません。

Matplotlibに組み込みの平行座標プロットはありますか？確かにギャラリーには見当たりません。
組み込み型がない場合、Matplotlibの標準機能を使用して平行座標プロットを作成することは可能ですか？

編集：

以下のZhenyaの回答に基づいて、任意の数の軸をサポートする次の一般化を開発しました。上記の元の質問で投稿した例のプロットスタイルに従って、各軸は独自のスケールを取得します。これは、各軸ポイントでデータを正規化し、軸の範囲を0〜1にすることで実現しました。次に、戻って、その切片で正しい値を示すラベルを各目盛りに適用します。

この関数は、反復可能なデータセットを受け入れることによって機能します。各データセットは、各ポイントが異なる軸上にあるポイントのセットと見なされます。`__main__`の例では、30本の線からなる2つのセットについて、各軸のランダムな数値を取得します。線は一定の範囲内でランダムなので、線のクラスタリングが生じます。これは私が確認したかった動作です。

このソリューションは、マウスの動作がおかしく、ラベルを介してデータ範囲を偽造しているため、組み込みソリューションほど良くありませんが、Matplotlibが組み込みソリューションを追加するまでは、許容範囲内です。

#!/usr/bin/python
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

def parallel_coordinates(data_sets, style=None):
    """Draw a parallel-coordinates plot with one independent scale per axis.

    Every data set is drawn as one polyline crossing ``dims`` vertical axes.
    Each dimension is normalised to the range 0..1 for drawing, and the tick
    labels of every axis are then rewritten to show the original data values.

    Parameters
    ----------
    data_sets : sequence of sequences of numbers
        One inner sequence per line; all inner sequences must have the same
        length (the number of dimensions, at least 2).
    style : sequence of str, optional
        One Matplotlib format string per data set (e.g. ``'r-'`` or ``'b'``).
        Defaults to ``'r-'`` for every data set.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, so callers can chain ``.show()``.

    Raises
    ------
    ValueError
        If the data sets have fewer than two dimensions.
    """
    dims = len(data_sets[0])
    if dims < 2:
        raise ValueError('parallel_coordinates needs at least two dimensions')
    x = range(dims)

    # squeeze=False always returns a 2-D array of Axes, so the two-dimension
    # case (a single subplot) can be iterated like any other.
    fig, axes = plt.subplots(1, dims - 1, sharey=False, squeeze=False)
    axes = axes[0]

    if style is None:
        style = ['r-'] * len(data_sets)

    # Per-dimension (min, max, range) of the raw data.
    min_max_range = []
    for column in zip(*data_sets):
        mn = min(column)
        mx = max(column)
        if mn == mx:
            # Constant dimension: widen to a unit range to avoid dividing by 0.
            mn -= 0.5
            mx = mn + 1.
        min_max_range.append((mn, mx, float(mx - mn)))

    # Normalise every value to 0..1 along its own dimension.
    norm_data_sets = [
        [(value - min_max_range[dimension][0]) / min_max_range[dimension][2]
         for dimension, value in enumerate(ds)]
        for ds in data_sets
    ]

    # Plot all lines on every subplot, then zoom each subplot to one segment.
    for i, ax in enumerate(axes):
        for dsi, d in enumerate(norm_data_sets):
            ax.plot(x, d, style[dsi])
        ax.set_xlim([x[i], x[i + 1]])

    # Fixed tick positions in normalised space. Pinning both the limits and
    # the ticks guarantees that every label sits at the value it names,
    # regardless of Matplotlib's automatic margins and tick locator.
    n_ticks = 6
    positions = [k / float(n_ticks - 1) for k in range(n_ticks)]

    def _label_axis(axis, dimension):
        """Pin *axis* to 0..1 and label its ticks with the raw data values."""
        mn, _, rng = min_max_range[dimension]
        axis.set_ylim(0., 1.)
        axis.set_yticks(positions)
        axis.set_yticklabels(['%4.2f' % (mn + p * rng) for p in positions])

    # One x tick per subplot (its left edge) plus value labels on its y axis.
    for dimension, ax in enumerate(axes):
        ax.xaxis.set_major_locator(ticker.FixedLocator([x[dimension]]))
        _label_axis(ax, dimension)

    # The last dimension gets its scale on the right-hand side via a twin axis.
    right = plt.twinx(axes[-1])
    right.xaxis.set_major_locator(ticker.FixedLocator([x[-2], x[-1]]))
    _label_axis(right, dims - 1)

    # Stack the subplots so they read as one continuous plot.
    plt.subplots_adjust(wspace=0)

    return plt


if __name__ == '__main__':
    import random

    # First cluster: 30 random lines, drawn in red.
    base  = [0,   0,  5,   5,  0]
    scale = [1.5, 2., 1.0, 2., 2.]
    # range() instead of the Python-2-only xrange() so the demo runs on Python 3.
    data = [[base[x] + random.uniform(0., 1.)*scale[x]
            for x in range(5)] for y in range(30)]
    colors = ['r'] * 30

    # Second cluster: 30 random lines with different offsets, drawn in blue.
    base  = [3,   6,  0,   1,  3]
    scale = [1.5, 2., 2.5, 2., 2.]
    data.extend([[base[x] + random.uniform(0., 1.)*scale[x]
                 for x in range(5)] for y in range(30)])
    colors.extend(['b'] * 30)

    parallel_coordinates(data, style=colors).show()

編集2：

これは、フィッシャーのアイリスデータをプロットするときに上記のコードから得られるものの例です。ウィキペディアの参照画像ほど良くはありませんが、Matplotlibだけがあり、多次元プロットが必要な場合は問題ありません。

この回答からの平行座標プロットの結果の例

score 56 · Accepted Answer

pandasには平行座標ラッパーがあります：

import pandas
import matplotlib.pyplot as plt
# parallel_coordinates lives in pandas.plotting; the old pandas.tools.plotting
# module was removed from pandas.
from pandas.plotting import parallel_coordinates

# Load the iris sample CSV and draw one line per row, coloured by the 'Name' column.
data = pandas.read_csv(r'C:\Python27\Lib\site-packages\pandas\tests\data\iris.csv', sep=',')
parallel_coordinates(data, 'Name')
plt.show()

スクリーンショット

ソースコード、その作成方法：plotting.py＃L494

score 19 · Accepted Answer

関連する質問に答えるとき、私は1つのサブプロットのみを使用して（他のプロットと簡単に組み合わせることができるように）バージョンを作成し、オプションで3次ベジェ曲線を使用してポイントを接続しました。プロットは、必要な軸数に合わせて調整されます。

import matplotlib.pyplot as plt
from matplotlib.path import Path
import matplotlib.patches as patches
import numpy as np

# Single host axes; all lines are drawn on it.
fig, host = plt.subplots()

# create some dummy data
ynames = ['P1', 'P2', 'P3', 'P4', 'P5']
N1, N2, N3 = 10, 5, 8
N = N1 + N2 + N3
# category label (1, 2 or 3) for each of the N rows
category = np.concatenate([np.full(N1, 1), np.full(N2, 2), np.full(N3, 3)])
y1 = np.random.uniform(0, 10, N) + 7 * category
y2 = np.sin(np.random.uniform(0, np.pi, N)) ** category
y3 = np.random.binomial(300, 1 - category / 10, N)
# NOTE(review): `** 1/3` parses as ((category / 6) ** 1) / 3, i.e. a division
# by 3, not a cube root. If a cube root was intended it needs `** (1/3)` -- confirm.
y4 = np.random.binomial(200, (category / 6) ** 1/3, N)
y5 = np.random.uniform(0, 800, N)

# organize the data
# dstack of five length-N vectors gives shape (1, N, 5); [0] yields (N, 5)
ys = np.dstack([y1, y2, y3, y4, y5])[0]
ymins = ys.min(axis=0)
ymaxs = ys.max(axis=0)
dys = ymaxs - ymins
ymins -= dys * 0.05  # add 5% padding below and above
ymaxs += dys * 0.05
dys = ymaxs - ymins  # recompute the ranges so they include the padding

# transform all data to be compatible with the main axis
# column 0 is kept as-is; every other column is rescaled linearly from its own
# padded range onto the padded range of column 0
zs = np.zeros_like(ys)
zs[:, 0] = ys[:, 0]
zs[:, 1:] = (ys[:, 1:] - ymins[1:]) / dys[1:] * dys[0] + ymins[0]


# One y-axis per data column: the host axes plus one twin for each extra column.
n_dims = ys.shape[1]
axes = [host]
axes.extend(host.twinx() for _ in range(n_dims - 1))

for idx, axis in enumerate(axes):
    # each axis shows the (padded) range of its own column
    axis.set_ylim(ymins[idx], ymaxs[idx])
    for side in ('top', 'bottom'):
        axis.spines[side].set_visible(False)
    if axis == host:
        continue
    # twins only keep their right spine, moved to the column's x position
    axis.spines['left'].set_visible(False)
    axis.yaxis.set_ticks_position('right')
    axis.spines["right"].set_position(("axes", idx / (n_dims - 1)))

# x axis of the host: one tick per column, labelled with the column name on top
host.set_xlim(0, n_dims - 1)
host.set_xticks(range(n_dims))
host.set_xticklabels(ynames, fontsize=14)
host.tick_params(axis='x', which='major', pad=7)
host.spines['right'].set_visible(False)
host.xaxis.tick_top()
host.set_title('Parallel Coordinates Plot', fontsize=18)

colors = plt.cm.tab10.colors

# Bezier control vertices, shared by every row:
# for each axis there is a control vertex at the point itself, one at 1/3rd
# towards the previous axis and one at 1/3rd towards the next axis; the first
# and last axis have one control vertex less.
# x-coordinates: every integer (the axes) plus two points in between.
# The count is derived from the number of AXES (columns of ys), not from the
# number of rows, so the curves span all axes even when there are fewer rows
# than axes.
n_axes = ys.shape[1]
xs = np.linspace(0, n_axes - 1, n_axes * 3 - 2, endpoint=True)
codes = [Path.MOVETO] + [Path.CURVE4] * (len(xs) - 1)

for j in range(N):
    # to just draw straight lines between the axes:
    # host.plot(range(ys.shape[1]), zs[j,:], c=colors[(category[j] - 1) % len(colors) ])

    # y-coordinates: repeat every point three times, except the first and last only twice
    verts = list(zip(xs, np.repeat(zs[j, :], 3)[1:-1]))
    # for x,y in verts: host.plot(x, y, 'go') # to show the control points of the beziers
    path = Path(verts, codes)
    patch = patches.PathPatch(path, facecolor='none', lw=1, edgecolor=colors[category[j] - 1])
    host.add_patch(patch)
plt.tight_layout()
plt.show()

これは、アイリスデータセットの同様のコードです。2番目の軸は、いくつかの交差線を避けるために逆になっています。

import matplotlib.pyplot as plt
from matplotlib.path import Path
import matplotlib.patches as patches
import numpy as np
from sklearn import datasets

# Load the iris measurements: one row per flower, one column per feature.
iris = datasets.load_iris()
ys = iris.data
ynames = iris.feature_names

# Column-wise limits with 5% padding below and above.
raw_min, raw_max = ys.min(axis=0), ys.max(axis=0)
padding = (raw_max - raw_min) * 0.05
ymins = raw_min - padding
ymaxs = raw_max + padding

# Reverse axis 1 (swap its limits) to have less crossings.
ymins[1], ymaxs[1] = ymaxs[1], ymins[1]
dys = ymaxs - ymins

# Transform all data to be compatible with the main axis: column 0 is kept,
# the other columns are mapped linearly onto the padded range of column 0.
rescaled = (ys[:, 1:] - ymins[1:]) / dys[1:] * dys[0] + ymins[0]
zs = np.column_stack([ys[:, 0], rescaled])

fig, host = plt.subplots(figsize=(10, 4))

# One y-axis per feature: the host axes plus one twin for each extra feature.
n_dims = ys.shape[1]
axes = [host]
axes.extend(host.twinx() for _ in range(n_dims - 1))

for idx, axis in enumerate(axes):
    # each axis shows the (padded, possibly reversed) range of its own column
    axis.set_ylim(ymins[idx], ymaxs[idx])
    for side in ('top', 'bottom'):
        axis.spines[side].set_visible(False)
    if axis == host:
        continue
    # twins only keep their right spine, moved to the column's x position
    axis.spines['left'].set_visible(False)
    axis.yaxis.set_ticks_position('right')
    axis.spines["right"].set_position(("axes", idx / (n_dims - 1)))

# x axis of the host: one tick per feature, labelled with its name on top
host.set_xlim(0, n_dims - 1)
host.set_xticks(range(n_dims))
host.set_xticklabels(ynames, fontsize=14)
host.tick_params(axis='x', which='major', pad=7)
host.spines['right'].set_visible(False)
host.xaxis.tick_top()
host.set_title('Parallel Coordinates Plot — Iris', fontsize=18, pad=12)

colors = plt.cm.Set2.colors
# one legend entry per species; filled with a patch of that species below
legend_handles = [None for _ in iris.target_names]

# Bezier control-vertex x positions, shared by every row: each integer (the
# axes) plus two points in between. The count is derived from the number of
# AXES (columns of ys), not from the number of rows.
n_axes = ys.shape[1]
xs = np.linspace(0, n_axes - 1, n_axes * 3 - 2, endpoint=True)
codes = [Path.MOVETO] + [Path.CURVE4] * (len(xs) - 1)

for j in range(ys.shape[0]):
    # create bezier curves: every y value is repeated three times, except the
    # first and last which are used only twice
    verts = list(zip(xs, np.repeat(zs[j, :], 3)[1:-1]))
    path = Path(verts, codes)
    patch = patches.PathPatch(path, facecolor='none', lw=2, alpha=0.7, edgecolor=colors[iris.target[j]])
    legend_handles[iris.target[j]] = patch
    host.add_patch(patch)
host.legend(legend_handles, iris.target_names,
            loc='lower center', bbox_to_anchor=(0.5, -0.18),
            ncol=len(iris.target_names), fancybox=True, shadow=True)
plt.tight_layout()
plt.show()

score 17 · Accepted Answer

それを行うためのより良い方法があると確信していますが、ここに簡単で汚いもの（本当に汚いもの）があります：

#!/usr/bin/python
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# vectors to plot: 4D for this example
y1 = [1, 2.3, 8.0, 2.5]
y2 = [1.5, 1.7, 2.2, 2.9]

x = [1, 2, 3, 8]  # spines

fig, (ax, ax2, ax3) = plt.subplots(1, 3, sharey=False)
subplots = (ax, ax2, ax3)

# plot the same on all the subplots
for axx in subplots:
    axx.plot(x, y1, 'r-', x, y2, 'b-')

# now zoom in each of the subplots to one segment between two spines,
# and put a single x tick on the left spine of each segment
for axx, left, right in zip(subplots, x[:-1], x[1:]):
    axx.set_xlim([left, right])
    axx.xaxis.set_major_locator(ticker.FixedLocator([left]))
ax3.xaxis.set_major_locator(ticker.FixedLocator([x[-2], x[-1]]))  # the last one

# EDIT: add the labels to the rightmost spine.
# tick_params(labelright=True) replaces setting Tick.label2On, an attribute
# that was removed from Matplotlib (assigning it is now a silent no-op).
ax3.tick_params(axis='y', labelright=True)

# stack the subplots together
plt.subplots_adjust(wspace=0)

plt.show()

これは基本的に、Joe Kingtonによる（はるかに優れた）回答「Python / Matplotlib - 不連続な軸を作成する方法はありますか？」に基づいています。また、同じ質問に対する他の回答も確認することをお勧めします。

この例では、垂直スケールをスケーリングしようとはしていません。これは、正確に何を達成しようとしているかによって異なるためです。

編集：これが結果です。（画像：ここに画像の説明を入力してください）

score 11 · Accepted Answer

pandasを使用する場合（thetaが提案しているように）、軸を個別にスケーリングする方法はありません。

異なる垂直軸が見つからない理由は、垂直軸がないためです。私たちの平行座標は、垂直線といくつかのラベルを描くだけで、他の2つの軸を「偽造」しています。

https://github.com/pydata/pandas/issues/7083#issuecomment-74253671

score 2 · Accepted Answer

plotlyには、parallel_coordinatesと呼ばれる優れたインタラクティブなソリューションがあります。これは問題なく機能します。

import plotly.express as px
# Built-in iris sample data shipped with plotly express.
df = px.data.iris()

# Human-readable axis titles for the data frame columns.
axis_labels = {
    "species_id": "Species",
    "sepal_width": "Sepal Width",
    "sepal_length": "Sepal Length",
    "petal_width": "Petal Width",
    "petal_length": "Petal Length",
}

# Lines are coloured by species id on a diverging scale centred on 2.
fig = px.parallel_coordinates(
    df,
    color="species_id",
    labels=axis_labels,
    color_continuous_scale=px.colors.diverging.Tealrose,
    color_continuous_midpoint=2,
)
fig.show()

score 1 · Accepted Answer

Matplotlibに基づくPaxplotと呼ばれるベータリリースされた平行座標プロットパッケージをプラグインしたいと思います。他の回答と同様の基本ロジックを使用し、クリーンな使用法を維持しながら機能を拡張します。

ドキュメントには、基本的な使用法、高度な使用法、およびPandasでの使用法の例が記載されています。元の質問で提供された図のように、アイリス（Iris）データセットをプロットする例を以下に示します。

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_iris
import paxplot

# Import data
# Import data: the four feature columns plus the numeric target as a fifth column
iris = load_iris(as_frame=True)
values = np.c_[iris['data'], iris['target']]
column_names = iris['feature_names'] + ['target']
df = pd.DataFrame(data=values, columns=column_names)
cols = df.columns

# Create figure with one parallel axis per column and draw every row
paxfig = paxplot.pax_parallel(n_axes=len(cols))
paxfig.plot(df.to_numpy())

# Add labels
paxfig.set_labels(cols)

# Set ticks: show the species names instead of 0/1/2 on the last axis
target_ticks = [0, 1, 2]
paxfig.set_ticks(ax_idx=-1, ticks=target_ticks, labels=iris.target_names)

# Add colorbar keyed to the first column
color_col = 0
colorbar_options = {'label': cols[color_col]}
paxfig.add_colorbar(ax_idx=color_col, cmap='viridis', colorbar_kwargs=colorbar_options)

plt.show()

アイリスプロット

完全に開示するために、私はPaxplotを作成し、それを何人かの友人と開発および保守してきました。貢献に興味があれば、ぜひお気軽にご連絡ください。

score 0 · Accepted Answer

私がこれまでに見た最良の例はこれです

https://python.g-node.org/python-summerschool-2013/_media/wiki/datavis/olympics_vis.py

normalised_coordinates関数を参照してください。超高速ではありませんが、私が試した限りでは動作します。

# Example call: three column names, a 3x3 array of values and the list [1, 2, 1].
# normalised_coordinates is not defined in this snippet -- presumably it comes from
# the linked olympics_vis.py; confirm the meaning of each argument there.
normalised_coordinates(['VAL_1', 'VAL_2', 'VAL_3'], np.array([[1230.23, 1500000, 12453.03], [930.23, 140000, 12453.03], [130.23, 120000, 1243.03]]), [1, 2, 1])

score 0 · Accepted Answer

まだ完璧にはほど遠いですが、機能し、比較的短いです：

import numpy as np

import matplotlib.pyplot as plt

def plot_parallel(data, labels):
    """Draw a parallel-coordinates plot of *data* and show it.

    One subplot is created per pair of neighbouring columns. Inside subplot
    ``i`` the y scale is that of column ``i``; the values of column ``i + 1``
    are rescaled linearly onto that range so each segment ends at the right
    height.

    Parameters
    ----------
    data : 2-D array-like of numbers
        One row per line, one column per axis (at least two columns).
    labels : sequence of str
        One x tick label per column.
    """
    # Work in floats: with an integer array the rescaled right-hand values
    # would be silently truncated.
    data = np.array(data, dtype=float)
    n_dims = data.shape[1]
    x = list(range(n_dims))

    # squeeze=False keeps an array of Axes even for a single subplot
    # (two-column data), so the loops below always work.
    fig, axis = plt.subplots(1, n_dims - 1, sharey=False, squeeze=False)
    axis = axis[0]

    # Column statistics are loop-invariant: compute them once.
    mins = data.min(axis=0)
    maxs = data.max(axis=0)
    spans = maxs - mins
    # A constant column has span 0; divide by 1 instead so its points land on
    # the lower limit rather than becoming NaN.
    safe_spans = np.where(spans == 0, 1.0, spans)

    for i, a in enumerate(axis):
        # Right-hand column mapped onto the value range of the left-hand column.
        right = (data[:, i + 1] - mins[i + 1]) * spans[i] / safe_spans[i + 1] + mins[i]
        for left_value, right_value in zip(data[:, i], right):
            a.plot(x[i:i + 2], [left_value, right_value])

    for i, a in enumerate(axis):
        a.set_xlim([x[i], x[i + 1]])
        a.set_xticks([x[i], x[i + 1]])
        a.set_xticklabels([labels[i], labels[i + 1]], minor=False, rotation=45)
        a.set_ylim([mins[i], maxs[i]])

    # Remove the horizontal gaps so the subplots read as one plot.
    plt.subplots_adjust(wspace=0)

    plt.show()

score 0 · Accepted Answer

@JohanCコードをpandasデータフレームに適合させ、カテゴリ変数でも機能するように拡張しました。データフレームの最初の変数として数値変数も配置できるようにするなど、コードをさらに改善する必要がありますが、今のところは良いと思います。


# Paths:
path_data = "data/"

# Packages:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.path import Path
import matplotlib.patches as patches
from functools import reduce

# Display options:
pd.set_option("display.width", 1200)
pd.set_option("display.max_columns", 300)
pd.set_option("display.max_rows", 300)

# Dataset:
df = pd.read_csv(path_data + "nasa_exoplanets.csv")
df_varnames = pd.read_csv(path_data + "nasa_exoplanets_var_names.csv")

# Variables (the first variable must be categoric):
my_vars = ["discoverymethod", "pl_orbper", "st_teff", "disc_locale", "sy_gaiamag"]
# Look up the display name of every variable, keeping the order of my_vars.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
my_vars_names = pd.concat(
    [df_varnames[df_varnames["var"] == var_code] for var_code in my_vars]
)
my_vars_names = my_vars_names["var_name"].values.tolist()

# Adapt the data: keep planets with letter "d", the selected columns only,
# and drop rows with missing values.
df = df.loc[df["pl_letter"] == "d"]
df_plot = df[my_vars]
df_plot = df_plot.dropna()
df_plot = df_plot.reset_index(drop = True)

# Convert to numeric matrix:
# numeric columns are used as-is; every other column is replaced by integer
# codes, and the value -> code mapping is stored in dics_vars (in column order).
ym = []
dics_vars = []
for var in my_vars:
    column = df_plot[var]
    if column.dtype.kind in ("i", "u", "f"):
        ym.append(column.tolist())
        continue
    dic_var = {val: code for code, val in enumerate(column.unique())}
    dics_vars.append(dic_var)
    ym.append([dic_var[val] for val in column.tolist()])
# Force a float matrix: with only integer columns the in-place padding below
# would otherwise fail with a casting error.
ym = np.array(ym, dtype = float).T

# Padding (5% below and above each column's range):
ymins = ym.min(axis = 0)
ymaxs = ym.max(axis = 0)
dys = ymaxs - ymins
ymins -= dys*0.05
ymaxs += dys*0.05

# Reverse some axes for better visual (swap their lower and upper limits):
axes_to_reverse = [0, 1]
for a in axes_to_reverse:
    ymaxs[a], ymins[a] = ymins[a], ymaxs[a]
dys = ymaxs - ymins

# Adjust to the main axis: column 0 is kept, the other columns are mapped
# linearly onto the range of column 0.
zs = np.zeros_like(ym)
zs[:, 0] = ym[:, 0]
zs[:, 1:] = (ym[:, 1:] - ymins[1:])/dys[1:]*dys[0] + ymins[0]

# Colors: one palette entry per level of the first (categorical) variable.
n_levels = len(dics_vars[0])
my_colors = ["#F41E1E", "#F4951E", "#F4F01E", "#4EF41E", "#1EF4DC", "#1E3CF4", "#F41EF3"]
cmap = LinearSegmentedColormap.from_list("my_palette", my_colors)
my_palette = [cmap(level/n_levels) for level in range(n_levels)]

# Plot:
fig, host_ax = plt.subplots(figsize = (20, 10), tight_layout = True)

# Make the axes: the host plus one twin y-axis per additional column.
n_columns = ym.shape[1]
axes = [host_ax]
axes.extend(host_ax.twinx() for _ in range(n_columns - 1))

# dic_count walks through dics_vars in step with the non-numeric columns.
dic_count = 0
for col, axis in enumerate(axes):
    axis.set_ylim(bottom = ymins[col], top = ymaxs[col])
    axis.spines.top.set_visible(False)
    axis.spines.bottom.set_visible(False)
    axis.ticklabel_format(style = 'plain')
    if axis != host_ax:
        # twins only keep their right spine, moved to the column's x position
        axis.spines.left.set_visible(False)
        axis.yaxis.set_ticks_position("right")
        axis.spines.right.set_position(("axes", col/(n_columns - 1)))
    if df_plot.iloc[:, col].dtype.kind not in ["i", "u", "f"]:
        # non-numeric column: one tick per code, labelled with the original value
        dic_var_i = dics_vars[dic_count]
        axis.set_yticks(range(len(dic_var_i)))
        axis.set_yticklabels(list(dic_var_i.keys()))
        dic_count += 1

# x axis of the host: one tick per variable, labelled with its display name
host_ax.set_xlim(left = 0, right = n_columns - 1)
host_ax.set_xticks(range(n_columns))
host_ax.set_xticklabels(my_vars_names, fontsize = 14)
host_ax.tick_params(axis = "x", which = "major", pad = 7)

# Make the curves:
host_ax.spines.right.set_visible(False)
host_ax.xaxis.tick_top()

# Bezier control-vertex x positions, shared by every row: each integer (the
# axes) plus two points in between. The count is derived from the number of
# AXES (columns of ym), not from the number of rows, so the curves always
# span every axis.
n_axes = ym.shape[1]
xs = np.linspace(0, n_axes - 1, n_axes*3 - 2, endpoint = True)
codes = [Path.MOVETO] + [Path.CURVE4]*(len(xs) - 1)

for j in range(ym.shape[0]):
    # every y value is repeated three times, except the first and last (twice)
    verts = list(zip(xs, np.repeat(zs[j, :], 3)[1: -1]))
    path = Path(verts, codes)
    # colour by the level of the first (categorical) variable of this row
    color_first_cat_var = my_palette[dics_vars[0][df_plot.iloc[j, 0]]]
    patch = patches.PathPatch(
        path,
        facecolor = "none",
        lw = 2,
        alpha = 0.7,
        edgecolor = color_first_cat_var
    )
    host_ax.add_patch(patch)

python - Matplotlibの平行座標プロット

回答 9 件

Related

Reference