python - matplotlib: データのない期間をスキップしながら時系列をプロットする

Question

tl;dr:時系列のプロット中にデータがない期間をスキップするにはどうすればよいですか?

長い計算を実行しており、その進行状況を監視したいと考えています。時々、この計算を中断します。ログは、次のような巨大な CSV ファイルに保存されます。

2016-01-03T01:36:30.958199,0,0,0,startup
2016-01-03T01:36:32.363749,10000,0,0,regular
...
2016-01-03T11:12:21.082301,51020000,13402105,5749367,regular
2016-01-03T11:12:29.065687,51030000,13404142,5749367,regular
2016-01-03T11:12:37.657022,51040000,13408882,5749367,regular
2016-01-03T11:12:54.236950,51050000,13412824,5749375,shutdown
2016-01-03T19:02:38.293681,51050000,13412824,5749375,startup
2016-01-03T19:02:49.296161,51060000,13419181,5749377,regular
2016-01-03T19:03:00.547644,51070000,13423127,5749433,regular
2016-01-03T19:03:05.599515,51080000,13427189,5750183,regular
...

実際には、41 列あります。各列は、進行状況の特定の指標です。2 番目の列は常に 10000 単位で増分されます。最後の列は一目瞭然です。

「シャットダウン」と「起動」の間の期間をスキップしながら、同じグラフに各列をプロットしたいと思います。理想的には、スキップごとに垂直線も引きたいと思います。

ここに私がこれまでに持っているものがあります:

import matplotlib.pyplot as plt
import pandas as pd

# < ... reading my CSV in a Pandas dataframe `df` ... >

fig, ax = plt.subplots()

# One line per progress metric: the 'total' column followed by the
# zero-padded columns '00' .. '39'.
columns = ['total'] + ['%02d' % i for i in range(40)]
timestamps = df.index.values
for col in columns:
    ax.plot_date(timestamps, df[col].values, '-')

# Slant the date labels so they do not overlap, then display the figure.
fig.autofmt_xdate()
plt.show()

その長いフラットな期間を取り除き、代わりに垂直線だけを描きたいのです。

`df.plot()` については知っていますが、私の経験では壊れています (とりわけ、Pandas は `datetime` オブジェクトを `date2num` と `num2date` を使わずに独自の形式で変換します)。

考えられる解決策は、カスタム scalerを作成することですが、それはかなり複雑に思えます。

私が理解している限り、カスタム Locator を作成しても目盛りの位置 (小さな垂直線と関連するラベル) のみが変更され、プロット自体の位置は変更されません。この理解は正しいですか？

UPD: 簡単な解決策は、タイムスタンプを変更することです (たとえば、「開始からの経過時間」に再計算します) が、できればタイムスタンプはそのまま保持したいです。

UPD: https://stackoverflow.com/a/5657491/1214547の回答は、いくつかの変更を加えて機能します。私はすぐに私の解決策を書きます。

score 1 · Accepted Answer

ここに私のために働く解決策があります。近くに配置されたブレークをうまく処理できません (ラベルが混雑しすぎる可能性があります) が、私の場合は問題ではありません。

import bisect
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.scale as mscale
import matplotlib.transforms as mtransforms
import matplotlib.dates as mdates
import pandas as pd

# heavily borrows from http://stackoverflow.com/a/5657491/1214547

def collapse_breaks(values, breaks):
    """Map axis values to compressed coordinates with every break cut out.

    Each ``(left, right)`` interval in *breaks* is squeezed to zero width:
    values before a break are untouched by it, values inside it land on its
    left edge, and values after it are shifted left by its full width.

    Unlike a single-cursor scan, the computation is element-wise, so
    *values* may be unsorted and of any shape.

    Parameters
    ----------
    values : array-like of float
        Positions in the original (uncompressed) coordinate.
    breaks : iterable of (left, right)
        Intervals to remove. They are expected to be non-overlapping with
        ``left <= right``; this is not checked.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as *values* holding compressed positions.
    """
    values = np.asanyarray(values, dtype=float)
    removed = np.zeros_like(values)
    for left, right in breaks:
        # Portion of this break lying to the left of each value: nothing
        # before the break, a partial width inside it, the full width after.
        removed += np.clip(values - left, 0.0, right - left)
    return values - removed


def expand_breaks(values, breaks):
    """Map compressed coordinates back to the original axis values.

    The forward mapping glues a whole break onto one point, so it is not
    strictly invertible: a compressed value sitting exactly on a collapsed
    break is mapped to the *left* edge of that break.

    Parameters
    ----------
    values : array-like of float
        Positions in the compressed coordinate (any order, any shape).
    breaks : iterable of (left, right)
        The same intervals that were given to the forward mapping, in
        ascending order.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as *values* holding original positions.
    """
    values = np.asanyarray(values, dtype=float)
    shift = np.zeros_like(values)
    removed = 0.0
    for left, right in breaks:
        width = right - left
        # ``left - removed`` is where this break sits once all earlier
        # breaks have been cut out. Shifts accumulate, so several breaks
        # lying between two neighbouring values are all re-inserted.
        shift += np.where(values > left - removed, width, 0.0)
        removed += width
    return values + shift


def CustomScaleFactory(breaks):
    """Build a matplotlib scale class that skips the given date intervals.

    Parameters
    ----------
    breaks : sequence of (left, right)
        Intervals, in matplotlib date numbers, to cut out of the axis.
        Expected to be sorted and non-overlapping.

    Returns
    -------
    type
        A ``ScaleBase`` subclass named ``'custom'``, ready to be passed to
        ``matplotlib.scale.register_scale``.
    """
    class CustomScale(mscale.ScaleBase):
        name = 'custom'

        def __init__(self, axis, **kwargs):
            # Current matplotlib releases appear to require the axis
            # argument here (TODO confirm against the installed version);
            # fall back to the argument-less call for releases that
            # reject it.
            try:
                mscale.ScaleBase.__init__(self, axis)
            except TypeError:
                mscale.ScaleBase.__init__(self)

        def get_transform(self):
            return self.CustomTransform()

        def set_default_locators_and_formatters(self, axis):
            class HourSkippingLocator(mdates.HourLocator):
                """Hour locator that drops ticks falling inside a break."""
                _breaks = breaks

                def _tick_allowed(self, tick):
                    return not any(left <= tick <= right
                                   for left, right in self._breaks)

                def __call__(self):
                    ticks = super(HourSkippingLocator, self).__call__()
                    ticks = [tick for tick in ticks if self._tick_allowed(tick)]
                    # Always label the moment at which each run resumed.
                    ticks.extend(right for (left, right) in self._breaks)
                    return ticks

            axis.set_major_locator(HourSkippingLocator(interval=3))
            axis.set_major_formatter(mdates.DateFormatter("%h %d, %H:%M"))

        class CustomTransform(mtransforms.Transform):
            """Forward transform: original date numbers -> compressed axis."""
            input_dims = 1
            output_dims = 1
            is_separable = True
            has_inverse = True
            _breaks = breaks

            def __init__(self):
                mtransforms.Transform.__init__(self)

            def transform_non_affine(self, a):
                return collapse_breaks(a, self._breaks)

            def inverted(self):
                return CustomScale.InvertedCustomTransform()

        class InvertedCustomTransform(mtransforms.Transform):
            """Inverse transform: compressed axis -> original date numbers."""
            input_dims = 1
            output_dims = 1
            is_separable = True
            has_inverse = True
            _breaks = breaks

            def __init__(self):
                mtransforms.Transform.__init__(self)

            def transform_non_affine(self, a):
                # Not an exact inverse: points glued together by the forward
                # transform come back as the left edge of the break.
                return expand_breaks(a, self._breaks)

            def inverted(self):
                return CustomScale.CustomTransform()

    return CustomScale


# < ... reading my CSV in a Pandas dataframe `df` ... >

# Row positions (not index labels) of every startup / shutdown record.
startups = np.flatnonzero(df['kind'] == 'startup')
shutdowns = np.flatnonzero(df['kind'] == 'shutdown')

# Pair each shutdown with the startup that follows it; the very first
# startup opens the log and therefore closes no break.
breaks_idx = list(zip(shutdowns, startups[1:]))
breaks_dates = [(df.index[stop], df.index[start])
                for stop, start in breaks_idx]
breaks = [(mdates.date2num(stop), mdates.date2num(start))
          for stop, start in breaks_dates]

fig, ax = plt.subplots()

for col in ['total'] + ['%02d' % i for i in range(40)]:
    ax.plot_date(df.index.values, df[col].values, '-')

# The original author notes that matplotlib has no way to unregister a
# scale, so this registration stays in place once made.
mscale.register_scale(CustomScaleFactory(breaks))
ax.set_xscale('custom')

# One vertical marker per break, drawn at the restart time and reaching
# from zero up to the 'total' value recorded at that restart.
vlines_x = [resume for _, resume in breaks]
vlines_ymin = np.zeros(len(vlines_x))
vlines_ymax = [df.iloc[start]['total'] for _, start in breaks_idx]
plt.vlines(vlines_x, vlines_ymin, vlines_ymax, color='darkgrey')

fig.autofmt_xdate()
plt.ticklabel_format(axis='y', style='plain')

plt.show()

python - matplotlib: データのない期間をスキップしながら時系列をプロットする

2 に答える 2

Related

Reference