pandas python dataframe,根据各种条件插入行

8yparm6h  于 2023-04-04  发布在  Python
关注(0)|答案(1)|浏览(136)

在下面的数据框中,每一行表示一天中的一个班次的一部分。

import pandas as pd

df = pd.DataFrame()

df['number'] = '169', '169', '169', '169', '2129', '2129', '2129', '2129', '2129', '2129', '5645', '5645', '5645', '5645', '5645', '5645', '5645', '5645', '7480', '7480', '7480', '7480', '7480', '7480', '7480', '6713', '6713', '6713', '6713', '6713', '6713', '6713', '6713','4004',   '4004', '4004', '4004', '4004', '4004'
df['date'] ='2/03/2023' ,'4/03/2023',   '4/03/2023','5/03/2023',    '2/03/2023',    '2/03/2023',    '2/03/2023' ,'4/03/2023',   '4/03/2023',    '5/03/2023',    '27/02/2023',   '27/02/2023',   '27/02/2023',   '28/02/2023',   '3/03/2023',    '3/03/2023',    '3/03/2023',    '3/03/2023',    '1/03/2023',    '2/03/2023',    '3/03/2023',    '4/03/2023',    '4/03/2023',    '5/03/2023',    '5/03/2023','2/03/2023',    '2/03/2023',    '2/03/2023' ,'4/03/2023',   '4/03/2023',    '4/03/2023',    '4/03/2023' ,'5/03/2023','28/02/2023',  '1/03/2023',    '2/03/2023',    '3/03/2023',    '4/03/2023',    '4/03/2023'
df['start'] = '21:00:00'    ,'21:00:00',    '21:00:00', '21:00:00', '6:00:00',  '6:00:00',  '6:00:00',  '14:30:00', '14:30:00', '14:30:00', '15:00:00', '15:00:00', '15:00:00', '21:00:00', '15:00:00', '15:00:00', '15:00:00', '15:00:00', '7:00:00',  '7:00:00','7:00:00' ,'7:00:00', '7:00:00',  '7:00:00',  '14:30:00', '6:30:00',  '6:30:00',  '6:30:00',  '6:30:00','6:30:00',    '6:30:00',  '6:30:00','14:30:00','22:45:00',    '22:45:00', '22:45:00', '22:45:00', '22:45:00', '22:45:00'
df['end'] = '7:00:00',  '7:00:00',  '7:00:00',  '7:00:00',  '21:00:00', '21:00:00', '21:00:00', '21:00:00', '21:00:00', '21:00:00', '7:00:00',  '7:00:00',  '7:00:00',  '7:00:00',  '7:00:00',  '7:00:00',  '7:00:00',  '7:00:00',  '13:45:00', '13:45:00'  ,'15:30:00',    '15:15:00', '15:15:00', '13:30:00', '23:00:00', '22:30:00', '22:30:00', '22:30:00', '21:00:00', '21:00:00', '21:00:00', '21:00:00'  ,'16:15:00','7:15:00','7:15:00','7:15:00','7:15:00','7:15:00','7:15:00'
df['hours'] = '9.5', '2.5', '7', '9.5', '10', '2', '2', '3.5', '2.5', '6', '10', '2', '3', '9.5', '10', '2', '1.5', '1.5', '6.25', '6.25', '8', '1', '6.75', '6', '8', '10', '2', '3', '2', '8', '2', '1.5', '1.75', '8', '8', '8', '8', '6', '2'
df['day'] = 'Thursday', 'Saturday', 'Saturday', 'Sunday', 'Thursday', 'Thursday', 'Thursday', 'Saturday', 'Saturday', 'Sunday', 'Monday', 'Monday', 'Monday', 'Tuesday', 'Friday', 'Friday', 'Friday', 'Friday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Saturday', 'Sunday', 'Sunday', 'Thursday', 'Thursday', 'Thursday', 'Saturday', 'Saturday', 'Saturday', 'Saturday', 'Sunday','Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Saturday'

print(df)

对于每一行,如果“day”列中的单元格是星期五,星期六或星期日,并且“start”列是在午夜之前,并且“end”列是在午夜之后,则具有通过午夜的“hours”余额的行(让我们称该行为“old row”)我需要做以下操作。通过“通过午夜”,我将引用的示例是编号“169”在星期六的21:00开始;由于星期六的第一行是2.5小时(即23:30结束),因此第二行是关于星期六的7小时,“经过午夜”。
1.直接在“旧行”下面插入一行(让我们称该行为“新行”)
1.“旧行”“小时”数据需要更改为当天整个班次必须达到午夜的剩余时间。(例如,对于“Aberash”,7个小时中仅需要0.5小时才能达到午夜(如前一行结束于23:30)
1.新行接收“小时”的余额(即7小时-0.5小时= 6.5小时)
1.新行“day”列数据更改为以下内容:
i)如果“旧行”是星期五,则新行是星期五/星期六
ii)如果“旧行”是星期六,则新行是星期六/星期日
iii)如果“旧行”是星期日,则新行是星期日/星期一
“新行”中未讨论的其他数据应与“旧行”相同。
下面是数据框在上面之后的样子

import pandas as pd

df = pd.DataFrame()

df['number'] = '169', '169', '169', '169', '169', '2129', '2129', '2129', '2129', '2129', '2129', '5645', '5645', '5645', '5645', '5645', '5645', '5645', '5645', '5645', '7480', '7480', '7480', '7480', '7480', '7480', '7480', '6713', '6713', '6713', '6713', '6713', '6713', '6713', '6713','4004',    '4004','4004',  '4004', '4004', '4004', '4004', '4004'
df['date'] = '2/03/2023','4/03/2023','4/03/2023',   '4/03/2023','5/03/2023',    '2/03/2023',    '2/03/2023',    '2/03/2023' ,'4/03/2023',   '4/03/2023',    '5/03/2023',    '27/02/2023',   '27/02/2023',   '27/02/2023',   '28/02/2023',   '3/03/2023',    '3/03/2023',    '3/03/2023',    '3/03/2023',    '3/03/2023',    '1/03/2023',    '2/03/2023',    '3/03/2023',    '4/03/2023',    '4/03/2023',    '5/03/2023',    '5/03/2023','2/03/2023',    '2/03/2023',    '2/03/2023' ,'4/03/2023',   '4/03/2023',    '4/03/2023',    '4/03/2023' ,'5/03/2023','28/02/2023',  '1/03/2023',    '2/03/2023',    '3/03/2023',    '3/03/2023',    '4/03/2023',    '4/03/2023',    '4/03/2023'
df['start'] = '21:00:00','21:00:00','21:00:00', '21:00:00', '21:00:00', '6:00:00',  '6:00:00',  '6:00:00',  '14:30:00', '14:30:00', '14:30:00', '15:00:00', '15:00:00', '15:00:00', '21:00:00', '15:00:00', '15:00:00', '15:00:00', '15:00:00','15:00:00',  '7:00:00',  '7:00:00','7:00:00' ,'7:00:00', '7:00:00',  '7:00:00',  '14:30:00', '6:30:00',  '6:30:00',  '6:30:00',  '6:30:00','6:30:00',    '6:30:00',  '6:30:00','14:30:00','22:45:00',    '22:45:00', '22:45:00', '22:45:00', '22:45:00', '22:45:00', '22:45:00', '22:45:00'
df['end'] = '7:00:00',  '7:00:00','7:00:00',    '7:00:00',  '7:00:00',  '21:00:00', '21:00:00', '21:00:00', '21:00:00', '21:00:00', '21:00:00', '7:00:00',  '7:00:00',  '7:00:00',  '7:00:00',  '7:00:00',  '7:00:00',  '7:00:00',  '7:00:00',  '7:00:00',  '13:45:00', '13:45:00'  ,'15:30:00',    '15:15:00', '15:15:00', '13:30:00', '23:00:00', '22:30:00', '22:30:00', '22:30:00', '21:00:00', '21:00:00', '21:00:00', '21:00:00'  ,'16:15:00','7:15:00','7:15:00','7:15:00','7:15:00','7:15:00','7:15:00','7:15:00','7:15:00'
df['hours'] = '9.5', '2.5', '0.5', '6.5', '9.5', '10', '2', '2', '3.5', '2.5', '6', '10', '2', '3', '9.5', '9', '1', '2', '1.5', '1.5', '6.25', '6.25', '8', '1', '6.75', '6', '8', '10', '2', '3', '2', '8', '2', '1.5', '1.75', '8', '8', '8', '1.25', '6.75', '1.25', '4.75', '2'
df['day'] = 'Thursday', 'Saturday', 'Saturday', 'Saturday/Sunday', 'Sunday', 'Thursday', 'Thursday', 'Thursday', 'Saturday', 'Saturday', 'Sunday', 'Monday', 'Monday', 'Monday', 'Tuesday', 'Friday', 'Friday/Saturday', 'Friday', 'Friday', 'Friday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Saturday', 'Sunday', 'Sunday', 'Thursday', 'Thursday', 'Thursday', 'Saturday', 'Saturday', 'Saturday', 'Saturday', 'Sunday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Friday/Saturday', 'Saturday', 'Saturday/Sunday', 'Saturday'

print(df)
6vl6ewon

6vl6ewon1#

这个解决方案很难看,很慢,而且通常无法管理。写它让我真的很生气。
这证明了为什么选择正确的数据结构会导致更干净的代码。无论如何,它是:

import pandas as pd
from datetime import timedelta

df = pd.DataFrame()

df['number'] = '169', '169', '169', '169', '2129', '2129', '2129', '2129', '2129', '2129', '5645', '5645', '5645', '5645', '5645', '5645', '5645', '5645', '7480', '7480', '7480', '7480', '7480', '7480', '7480', '6713', '6713', '6713', '6713', '6713', '6713', '6713', '6713','4004',   '4004', '4004', '4004', '4004', '4004'
df['date'] ='2/03/2023' ,'4/03/2023',   '4/03/2023','5/03/2023',    '2/03/2023',    '2/03/2023',    '2/03/2023' ,'4/03/2023',   '4/03/2023',    '5/03/2023',    '27/02/2023',   '27/02/2023',   '27/02/2023',   '28/02/2023',   '3/03/2023',    '3/03/2023',    '3/03/2023',    '3/03/2023',    '1/03/2023',    '2/03/2023',    '3/03/2023',    '4/03/2023',    '4/03/2023',    '5/03/2023',    '5/03/2023','2/03/2023',    '2/03/2023',    '2/03/2023' ,'4/03/2023',   '4/03/2023',    '4/03/2023',    '4/03/2023' ,'5/03/2023','28/02/2023',  '1/03/2023',    '2/03/2023',    '3/03/2023',    '4/03/2023',    '4/03/2023'
df['start'] = '21:00:00'    ,'21:00:00',    '21:00:00', '21:00:00', '6:00:00',  '6:00:00',  '6:00:00',  '14:30:00', '14:30:00', '14:30:00', '15:00:00', '15:00:00', '15:00:00', '21:00:00', '15:00:00', '15:00:00', '15:00:00', '15:00:00', '7:00:00',  '7:00:00','7:00:00' ,'7:00:00', '7:00:00',  '7:00:00',  '14:30:00', '6:30:00',  '6:30:00',  '6:30:00',  '6:30:00','6:30:00',    '6:30:00',  '6:30:00','14:30:00','22:45:00',    '22:45:00', '22:45:00', '22:45:00', '22:45:00', '22:45:00'
df['end'] = '7:00:00',  '7:00:00',  '7:00:00',  '7:00:00',  '21:00:00', '21:00:00', '21:00:00', '21:00:00', '21:00:00', '21:00:00', '7:00:00',  '7:00:00',  '7:00:00',  '7:00:00',  '7:00:00',  '7:00:00',  '7:00:00',  '7:00:00',  '13:45:00', '13:45:00'  ,'15:30:00',    '15:15:00', '15:15:00', '13:30:00', '23:00:00', '22:30:00', '22:30:00', '22:30:00', '21:00:00', '21:00:00', '21:00:00', '21:00:00'  ,'16:15:00','7:15:00','7:15:00','7:15:00','7:15:00','7:15:00','7:15:00'
df['hours'] = '9.5', '2.5', '7', '9.5', '10', '2', '2', '3.5', '2.5', '6', '10', '2', '3', '9.5', '10', '2', '1.5', '1.5', '6.25', '6.25', '8', '1', '6.75', '6', '8', '10', '2', '3', '2', '8', '2', '1.5', '1.75', '8', '8', '8', '8', '6', '2'
df['day'] = 'Thursday', 'Saturday', 'Saturday', 'Sunday', 'Thursday', 'Thursday', 'Thursday', 'Saturday', 'Saturday', 'Sunday', 'Monday', 'Monday', 'Monday', 'Tuesday', 'Friday', 'Friday', 'Friday', 'Friday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Saturday', 'Sunday', 'Sunday', 'Thursday', 'Thursday', 'Thursday', 'Saturday', 'Saturday', 'Saturday', 'Saturday', 'Sunday','Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Saturday'

# converting to more sensible data types
df['date'] = pd.to_datetime(df['date'], dayfirst=True)
df['hours'] = df['hours'].astype(float)

# df_okay will be the sink for the non-problematic shifts
# any shift outside of Friday, Saturday and Sunday aren't our concern
df_okay = df[~df['day'].isin(['Friday', 'Saturday','Sunday'])].copy()

df = df[df['day'].isin(['Friday', 'Saturday','Sunday'])].copy()

# let's use datetime objects to compare dates and calculate durations
df['start_dt'] = pd.to_datetime(df['date'].dt.strftime('%Y-%m-%d ') + df['start'])
df['end_dt'] = pd.to_datetime(df['date'].dt.strftime('%Y-%m-%d ') + df['end'])
# let's add a day for the end datetime of shifts which go over midnight
df.loc[df['start_dt']>df['end_dt'], 'end_dt'] += timedelta(1)

# any shift which doesn't go over midnight can't be problematic
# so it goes in the sink
df_okay = pd.concat((df_okay, df[df['start_dt'].dt.date == df['end_dt'].dt.date]))

# these shifts go over midnight so they're at risk
df = df[df['start_dt'].dt.date != df['end_dt'].dt.date]

# let's calculate the total number of hours for each `number` during the shift
df_problem_shifts = df[['number', 'start_dt', 'end_dt', 'hours']].groupby(['number','start_dt', 'end_dt']).sum().reset_index().rename(columns={'hours':'total_hours'})

# let's calculate the number of available hours between the beginning of the shift and midnight
df_problem_shifts['to_midnight'] = (pd.to_datetime(df_problem_shifts['start_dt'].dt.date + timedelta(days=1)) - df_problem_shifts['start_dt'] ).dt.total_seconds()/3600

# these shifts have a sum of `hours` which is more than the available hours until midnight
df_problem_shifts = df_problem_shifts[df_problem_shifts['to_midnight'] < df_problem_shifts['total_hours']]

# simple dictionnary which will be used to replace `day` with `day/day+1`
days = ['Friday', 'Saturday', 'Sunday', 'Monday']
days = {day: f"{day}/{day2}" for (day, day2) in zip(days, days[1:])}
new_rows = []
for i,row in df_problem_shifts.iterrows():
    df_hours = df.loc[(df.number == row.number) & (df.start_dt == row.start_dt) & (df.end_dt == row.end_dt)]
    available_hours = row.to_midnight
    for j,row2 in df_hours.iterrows():
        if available_hours == 0:
            df.loc[j,'day'] = days[df.loc[j, 'day']]
        else:
            available_hours -= row2.hours
            if available_hours < 0:
                df.loc[j, 'hours'] = row2.hours + available_hours
                row2.day = days[row2.day]
                row2.hours = - available_hours
                available_hours = 0
                new_rows.append(row2)
new_rows = pd.DataFrame(new_rows)

# at this point df contains the shift rows for all
# shifts which had to be modified
df = pd.concat((df, new_rows))

# putting back all the shifts which were
# previously declared non-problematic
df = pd.concat((df_okay,df)).drop(['start_dt', 'end_dt'], axis=1)
df.sort_values(['number', 'date'])

相关问题