假设我有以下时间序列数据框架：

import numpy as np
import pandas as pd
import random
np.random.seed(2019)

# Generate TS
#rng = pd.date_range('2019-01-01', freq='MS', periods=N)
ts = pd.date_range('2000-01-01', '2000-12-31 23:00', freq='H') #.strftime('%d-%m-%Y %H:%M:%S')    # freq='MS'set the frequency of date in months and start from day 1. You can use 'T' for minutes and so on

# number of samples
N = len(ts)

# Create a random dataset
data = {
    #"TS": ts,
    'Appx':  [random.choice(['App1', 'App2', 'App3', None]) for _ in range(N)], # generate categorical data including missing data "None"
    'VM':    [random.choice(['VM1' , 'VM2' ]) for _ in range(N)]
}

df = pd.DataFrame(data, index=ts)
#df.resample('M').mean().plot()
df
#                       Appx    VM
#2000-01-01 00:00:00    App1    VM2
#2000-01-01 01:00:00    None    VM1
#2000-01-01 02:00:00    None    VM2
#2000-01-01 03:00:00    App3    VM2
#2000-01-01 04:00:00    App1    VM1
#...    ... ...
#2000-12-31 19:00:00    App2    VM1
#2000-12-31 20:00:00    App3    VM1
#2000-12-31 21:00:00    App3    VM1
#2000-12-31 22:00:00    App1    VM1
#2000-12-31 23:00:00    App1    VM1
# 8784 rows × 2 columns

查看其他可用资源后：

Plotting categorical data counts over time
How to make a line plot from a dataframe with multiple categorical columns in matplotlib
How to groupby dataframe with categorical variables for making linechart in matplotlib?
How to plot by category over time
Plot time series on category level
Plotting multiple time series after a groupby in pandas
Pandas: plot multiple time series DataFrame into a single plot
Plot a line graph with categorical columns for each line
**问题：**在pandas框架内绘制分类变量的计数记录，包括其缺失值（None或NaN）
**我的尝试：**我尝试使用以下脚本绘制计数记录，但没有成功：首先，我使用了一个简单的示例，通过绘制每个所需VM随时间推移的缺失值（虚线）内的应用列记录，来描述受here启发的数据和缺失值：

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

day = ([ 1 , 2 , 3, 4, 5 , 6 , 7 , 8 , 9])
App1 = ([0.6 , 0.8 , np.nan, np.nan, 4 , 6 , 6.5 ,7 , 8])
App2 = ([ 1 , 2 , np.nan, np.nan, 0.5 , 7 , 8 , 9 , 10])
App3 = ([ 1.5 , 2.5 , np.nan, np.nan, 3 , 4 , 6 , 8 , 11])

cf = pd.DataFrame({'App1': App1, 'App2': App2, 'App3': App3}, index = day)
cf.index.name = 'day'

fig, ax = plt.subplots()
line, = ax.plot(cf['App1'].fillna(method='ffill'), color='r', ls = '--', lw = 1, label='_nolegend_')
ax.plot(cf['App1'], color='k', lw=1.5, marker = 'v', label='App1',)
line, = ax.plot(cf['App2'].fillna(method='ffill'), color='r', ls = '--', lw = 1, label='_nolegend_')
ax.plot(cf['App2'], color='k', lw=1.5, marker = 's', label='App2')
line, = ax.plot(cf['App3'].fillna(method='ffill'), color='r', ls = '--', lw = 1, label='_nolegend_')
ax.plot(cf['App3'], color='k', lw=1.5, marker = 'o', label='App3')

plt.xlabel('Time Stamp')
plt.ylabel('Record counts')
plt.title('Apps within missing values for VM1')
plt.legend()
plt.show()

到目前为止，我的输出：

但是，当我将其应用于基于此answer生成的时间序列数据时，我得到了错误：

import matplotlib.pyplot as plt

df['Appx'].fillna(value=np.nan, inplace=True)
df['Appx'].astype('category') # or str for string
#df = df.astype(int)

# Filter the DataFrame by a list of string values in the "App1" column
filtered_df  = df[ df["Appx"].isin([np.nan])]
filtered_dff = df[~df["Appx"].isin([np.nan])]

cf = pd.DataFrame({'Appx': filtered_dff["Appx"]}, index = df.index)
#cf.index.name = df.index #'TS'

fig, ax = plt.subplots()
line, = ax.plot(cf['Appx'].fillna(method='ffill'), ls = '--', lw = 1, label='_nolegend_')
ax.plot(cf['Appx'], color=line.get_color(), lw=1.5, marker = 'o')
ax.tick_params(axis='x', labelrotation=45)
plt.xlabel('TS')
plt.ylabel('mm')
plt.legend('best')
plt.show()

TypeError：'value'必须是str或bytes的示例，而不是float
甚至我使用groupby()进一步挖掘：

# reset_index() gives a column for counting, after groupby uses year and category
ctdf = (df.reset_index()
          .groupby(['Appx','VM'], as_index=False)
          .count()
          # rename isn't strictly necessary here, it's just for readability
          .rename(columns={'index':'ct'})
       )
ctdf

#    Appx    VM  ct
#0  App1    VM1 1127
#1  App1    VM2 1084
#2  App2    VM1 1066
#3  App2    VM2 1098
#4  App3    VM1 1084
#5  App3    VM2 1049

df['Appx'].fillna(value=np.nan, inplace=True)
df['Appx'].astype('category') # or str for string
#df = df.astype(int)

# Filter the DataFrame by a list of string values in the "App1" column
filtered_df  = df[ df["Appx"].isin([np.nan])]
#filtered_dff = df[~df["Appx"].isin([np.nan])]

# reset_index() gives a column for counting, after groupby uses year and category
ctdff = (filtered_df
          #.isna()
          .reset_index()
          .groupby(['VM'], as_index=False)
          .count()
          # rename isn't strictly necessary here, it's just for readability
          .rename(columns={'index':'ct'})
       )
ctdff

#   VM   ct   Appx
#0  VM1 1153    0
#1  VM2 1123    0

类似于这个answer我可能感兴趣这样的情节所谓的cat_horizontal_plot：

注意：我对尽可能多地删除或估算解决方案不感兴趣：

在这些极端情况下，我不能显示缺失值：

import seaborn as sns
import matplotlib.pyplot as plt
sns.lineplot(data = df, x = df.index, y = 'Appx', hue = 'Appx', marker='o', alpha=0.2)
plt.legend(bbox_to_anchor=[0.5, 1.02], loc='lower center')
plt.xticks(rotation=45)
plt.show()

grouped = df.groupby(['VM','Appx'])

for key, group in grouped:
    data = group.groupby(lambda x: x.hour).count()
    data['Appx'].plot(label=key , legend=True)

如果要始终包含None或NaN的计数，则将它们转换为字符串。

# convert the None or NaN to a string
df.Appx = df.Appx.fillna('None')

# make the column categorical
df.Appx = pd.Categorical(values=df.Appx, categories=sorted(df.Appx.unique()), ordered=True)

月度盘点汇总条形图

# separate the dataframe by uniqe VM, and aggreate
vm = {v: df[df.VM.eq(v)].groupby([pd.Grouper(level=0, freq='M'), 'Appx'], observed=False).count().unstack(level=1) for v in sorted(df.VM.unique())}

# plot each VM separately
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10), tight_layout=True, sharex=True)

for ax, key in zip(axes, vm.keys()):

    data = vm[key]['VM']

    data.plot(kind='bar', ax=ax, ylabel='Counts', xlabel='Date', title=f'VM: {key}', rot=0)

    ax.set_xticks(ticks=range(len(data.index)), labels=data.index.strftime('%Y-%m'))

比例叠加条

vm = {v: df[df.VM.eq(v)].groupby([pd.Grouper(level=0, freq='M'), 'Appx'], observed=False).count().unstack(level=1) for v in sorted(df.VM.unique())}

fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10), tight_layout=True, sharex=True)

for ax, key in zip(axes, vm.keys()):

    data = vm[key]['VM']

    percent = data.div(data.sum(axis=1), axis=0).mul(100).round(1)

    percent.plot(kind='bar', stacked=True, ax=ax, ylabel='Counts', xlabel='Date', title=f'VM: {key}', rot=0)

    ax.set_xticks(ticks=range(len(data.index)), labels=data.index.strftime('%Y-%m'))

日计数聚合散点

# separate the dataframe by uniqe VM, and aggreate
vm = {v: df[df.VM.eq(v)].groupby([pd.Grouper(level=0, freq='d'), 'Appx'], observed=False).count().unstack(level=1) for v in sorted(df.VM.unique())}

# plot each VM separately
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10), tight_layout=True, sharex=True)

for ax, key in zip(axes, vm.keys()):

    data = vm[key]['VM']

    data.plot(ax=ax, ylabel='Counts', xlabel='Date', title=f'VM: {key}', rot=0, linestyle='none', marker='.')

迭代每一列以添加不同的marker或linestyle。

vm = {v: df[df.VM.eq(v)].groupby([pd.Grouper(level=0, freq='d'), 'Appx'], observed=False).count().unstack(level=1) for v in sorted(df.VM.unique())}

fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10), tight_layout=True, sharex=True)

for ax, key in zip(axes, vm.keys()):

    data = vm[key]['VM']

    cols = data.columns

    linestyles = ['-', '-', '-', ':']
    markers = ['d', 'v', 's', '.']

    for col, ls, marker in zip(cols, linestyles, markers):
        data[col].plot(ax=ax, ylabel='Counts', xlabel='Date', title=f'VM: {key}', rot=0, linestyle=ls, marker=marker)

    ax.legend()

matplotlib 可视化分类变量的计数记录，包括其缺失值(“无”或“NaN”)

1条答案

月度盘点汇总条形图

比例叠加条

日计数聚合散点

相关问题

热门标签

最新问答