假设我有以下时间序列 DataFrame:
import numpy as np
import pandas as pd
import random
# Seed BOTH generators: the categorical columns below are drawn with the
# stdlib `random` module, so seeding NumPy alone does not make them
# reproducible from run to run.
np.random.seed(2019)
random.seed(2019)

# Hourly timestamps from 2000-01-01 00:00 through 2000-12-31 23:00.
# 'h' is the hourly alias (the upper-case 'H' spelling is deprecated in
# recent pandas); other aliases such as 'MS' (month start) work the same way.
ts = pd.date_range('2000-01-01', '2000-12-31 23:00', freq='h')

# Number of samples.
N = len(ts)

# Random categorical dataset; `None` in 'Appx' stands for a missing value.
data = {
    'Appx': [random.choice(['App1', 'App2', 'App3', None]) for _ in range(N)],
    'VM': [random.choice(['VM1', 'VM2']) for _ in range(N)],
}
df = pd.DataFrame(data, index=ts)
df
# Appx VM
#2000-01-01 00:00:00 App1 VM2
#2000-01-01 01:00:00 None VM1
#2000-01-01 02:00:00 None VM2
#2000-01-01 03:00:00 App3 VM2
#2000-01-01 04:00:00 App1 VM1
#... ... ...
#2000-12-31 19:00:00 App2 VM1
#2000-12-31 20:00:00 App3 VM1
#2000-12-31 21:00:00 App3 VM1
#2000-12-31 22:00:00 App1 VM1
#2000-12-31 23:00:00 App1 VM1
# 8784 rows × 2 columns
查看其他可用资源后:
- Plotting categorical data counts over time
- How to make a line plot from a dataframe with multiple categorical columns in matplotlib
- How to groupby dataframe with categorical variables for making linechart in matplotlib?
- How to plot by category over time
- Plot time series on category level
- Plotting multiple time series after a groupby in pandas
- Pandas: plot multiple time series DataFrame into a single plot
- Plot a line graph with categorical columns for each line
**问题:** 如何在 pandas 中绘制分类变量随时间变化的记录计数,并且包含其缺失值(`None` 或 `NaN`)?
**我的尝试:** 我尝试用下面的脚本绘制记录计数,但没有成功。首先,我参考了另一个回答(原文链接“here”),用一个简单的示例来描述数据及其缺失值:针对某个 VM,绘制各个 App 列随时间变化的记录数,缺失值所在的区间用虚线表示:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Toy example: values for three apps over nine days, with NaN on days 3 and 4
# standing in for missing observations.
day = [1, 2, 3, 4, 5, 6, 7, 8, 9]
App1 = [0.6, 0.8, np.nan, np.nan, 4, 6, 6.5, 7, 8]
App2 = [1, 2, np.nan, np.nan, 0.5, 7, 8, 9, 10]
App3 = [1.5, 2.5, np.nan, np.nan, 3, 4, 6, 8, 11]
cf = pd.DataFrame({'App1': App1, 'App2': App2, 'App3': App3}, index=day)
cf.index.name = 'day'

fig, ax = plt.subplots()
# Each app is drawn twice so the missing stretch stays visible.
for app, marker in (('App1', 'v'), ('App2', 's'), ('App3', 'o')):
    # Dashed red line: forward-filled values bridge the gap. `Series.ffill()`
    # replaces the deprecated `fillna(method='ffill')` spelling.
    ax.plot(cf[app].ffill(), color='r', ls='--', lw=1, label='_nolegend_')
    # Solid black line on top: observed values only, so it breaks at NaN.
    ax.plot(cf[app], color='k', lw=1.5, marker=marker, label=app)

plt.xlabel('Time Stamp')
plt.ylabel('Record counts')
plt.title('Apps within missing values for VM1')
plt.legend()
plt.show()
到目前为止,我的输出:
但是,当我参照这个回答(原文链接“answer”)把它应用到上面生成的时间序列数据时,出现了错误:
import matplotlib.pyplot as plt
# Attempt to reuse the dashed/solid plotting idea on the categorical 'Appx'
# column of the hourly DataFrame `df`.
# NOTE(review): `inplace=True` is applied to the selection `df['Appx']`; whether
# `df` itself is updated depends on the pandas version — confirm.
df['Appx'].fillna(value=np.nan, inplace=True)
# NOTE(review): the converted Series is not assigned to anything, so this
# statement leaves `df` unchanged.
df['Appx'].astype('category') # or str for string
#df = df.astype(int)
# Split the rows by whether 'Appx' matches NaN through `isin([np.nan])`:
# `filtered_df` keeps the matching rows, `filtered_dff` keeps the rest.
filtered_df = df[ df["Appx"].isin([np.nan])]
filtered_dff = df[~df["Appx"].isin([np.nan])]
# Put the non-matching values back on the full index of `df`; timestamps that
# are absent from `filtered_dff` presumably come out as NaN in `cf`.
cf = pd.DataFrame({'Appx': filtered_dff["Appx"]}, index = df.index)
#cf.index.name = df.index #'TS'
fig, ax = plt.subplots()
# Dashed line: forward-filled labels.
line, = ax.plot(cf['Appx'].fillna(method='ffill'), ls = '--', lw = 1, label='_nolegend_')
# Solid line in the same colour: the un-filled column.
# NOTE(review): here the y-values are string labels mixed with float NaN, which
# is presumably what triggers the TypeError quoted after this snippet — confirm.
ax.plot(cf['Appx'], color=line.get_color(), lw=1.5, marker = 'o')
ax.tick_params(axis='x', labelrotation=45)
plt.xlabel('TS')
plt.ylabel('mm')
# NOTE(review): 'best' is passed positionally rather than as `loc='best'` —
# confirm this is what was intended.
plt.legend('best')
plt.show()
`TypeError: 'value' must be an instance of str or bytes, not a float`(即:'value' 必须是 str 或 bytes 的实例,而不是 float)
我甚至使用 `groupby()` 做了进一步的尝试:
# Count records per (Appx, VM) pair.
# reset_index() turns the timestamp index into an ordinary 'index' column,
# which gives count() something to tally within each group.
flat = df.reset_index()
per_app_vm = flat.groupby(['Appx','VM'], as_index=False).count()
# The rename is only for readability: 'index' now holds the record count.
ctdf = per_app_vm.rename(columns={'index':'ct'})
ctdf
# Appx VM ct
#0 App1 VM1 1127
#1 App1 VM2 1084
#2 App2 VM1 1066
#3 App2 VM2 1098
#4 App3 VM1 1084
#5 App3 VM2 1049
# Count, per VM, the rows whose 'Appx' value is missing.
# NOTE(review): `inplace=True` is applied to the selection `df['Appx']`; whether
# `df` itself is updated depends on the pandas version — confirm.
df['Appx'].fillna(value=np.nan, inplace=True)
# NOTE(review): the converted Series is not assigned to anything, so this
# statement leaves `df` unchanged.
df['Appx'].astype('category') # or str for string
#df = df.astype(int)
# Keep only the rows where 'Appx' matches NaN through `isin([np.nan])`.
filtered_df = df[ df["Appx"].isin([np.nan])]
#filtered_dff = df[~df["Appx"].isin([np.nan])]
# reset_index() turns the timestamp index into an 'index' column; after the
# groupby on 'VM', count() tallies the non-null entries of every other column.
ctdff = (filtered_df
#.isna()
.reset_index()
.groupby(['VM'], as_index=False)
.count()
# The rename is only for readability: 'index' becomes the row count 'ct'.
# 'Appx' is all-missing in `filtered_df`, so its count comes out as 0.
.rename(columns={'index':'ct'})
)
ctdff
# VM ct Appx
#0 VM1 1153 0
#1 VM2 1123 0
类似于这个回答(原文链接“answer”),我可能对所谓的 `cat_horizontal_plot` 这类图感兴趣:
注意:我对删除或插补(impute)缺失值之类的解决方案不感兴趣,例如:
- Handling missing categorical values ML
- Applying OneHotEncoding on categorical data with missing values
- replace missing values in categorical data
- Deal with missing categorical data python
在下面这些尝试中,我都无法显示缺失值:
import seaborn as sns
import matplotlib.pyplot as plt
# Attempt 1: seaborn line plot of the raw 'Appx' labels over time, one hue per
# label.
sns.lineplot(data=df, x=df.index, y='Appx', hue='Appx', marker='o', alpha=0.2)
plt.legend(bbox_to_anchor=[0.5, 1.02], loc='lower center')
plt.xticks(rotation=45)
plt.show()

# Attempt 2: one line per (VM, Appx) group, counting records per hour of day.
# groupby() leaves out missing keys by default, so rows with a missing 'Appx'
# get no line of their own.
grouped = df.groupby(['VM', 'Appx'])
for key, group in grouped:
    # The loop body must be indented under the `for`. The per-hour counts get
    # their own name so the earlier `data` dict is not overwritten.
    hourly = group.groupby(lambda x: x.hour).count()
    hourly['Appx'].plot(label=key, legend=True)
1条答案
按热度按时间6ovsh4lw1#
如果需要 `None` 或 `NaN` 的计数,则将它们转换为字符串。
- 月度计数汇总条形图
- 比例堆叠条形图
- 按日计数汇总的散点图

`marker` 或 `linestyle`。