pandas:在小数据集上创建虚拟变量(dummies)

35g0bw71  于 2023-04-10  发布在  其他
关注(0)|答案(2)|浏览(127)

有人能帮我用下面这个函数创建虚拟变量(dummy variables)吗:

def make_dummies(df):
    """Add hour, day-of-week, weekend and month indicator columns to ``df``.

    The function reads ``df.index`` through ``.hour``, ``.strftime``,
    ``.dayofweek`` and ``.month``, so it presumably expects a datetime-like
    index (the accompanying example converts it with ``pd.to_datetime``).

    Side effect: the helper columns ``hour``, ``day_of_week``, ``weekend`` and
    ``month`` are assigned onto the ``df`` object that was passed in before
    the name is rebound to the concatenated result.

    Returns the concatenation of ``df`` with every dummy frame, with the
    ``hour``, ``day_of_week`` and ``month`` helper columns dropped
    (``weekend`` is kept).
    """
    # Dummies taken straight from the integer hour of the index.
    hours = pd.get_dummies(df.index.hour, prefix='hour')

    # Helper columns derived from the index: zero-padded hour string,
    # day-of-week number, a 0/1 weekend flag (days 5 and 6) and month number.
    df['hour'] = df.index.strftime('%H')
    df['day_of_week'] = df.index.dayofweek
    df['weekend'] = np.where(df['day_of_week'].isin([5,6]), 1, 0)
    df['month'] = df.index.month

    # Second set of hour dummies, this time from the 'hour' string column.
    hour_dummies = pd.get_dummies(df['hour'], prefix='hour')

    # Declaring all seven day names as categories is meant to yield one
    # dummy column per weekday even when a day is absent from the data.
    day_mapping = {0: 'monday', 1: 'tuesday', 2: 'wednesday', 3: 'thursday', 4: 'friday', 5: 'saturday', 6: 'sunday'}
    all_days = pd.Categorical(df['day_of_week'].map(day_mapping), categories=day_mapping.values())
    day_dummies = pd.get_dummies(all_days)

    # Same approach for the twelve month abbreviations.
    month_mapping = {1: 'jan', 2: 'feb', 3: 'mar', 4: 'apr', 5: 'may', 6: 'jun', 7: 'jul',
                     8: 'aug', 9: 'sep', 10: 'oct', 11: 'nov', 12: 'dec'}
    all_months = pd.Categorical(df['month'].map(month_mapping), categories=month_mapping.values())
    month_dummies = pd.get_dummies(all_months)

    # Merge all dummies with the original DataFrame, column-wise.
    # NOTE(review): `hours`, `day_dummies` and `month_dummies` are built from
    # an Index / Categorical and are never re-indexed to `df.index` here; the
    # surrounding discussion reports that this concat then stacks rows
    # instead of aligning them -- confirm before reusing this version.
    df = pd.concat([df, hours, hour_dummies, day_dummies, month_dummies], axis=1)

    # Drop the helper columns that the dummies replace.
    df = df.drop(['hour', 'day_of_week', 'month'], axis=1)

    return df

在这样的小数据集上:

import pandas as pd
import numpy as np

# Sample input: five hourly temperature readings with UTC-05:00 timestamps.
data = {
    "temp": [53.13, 52.93, 52.56, 51.58, 47.57],
    "Date": [
        "2023-04-07 15:00:00-05:00",
        "2023-04-07 16:00:00-05:00",
        "2023-04-07 17:00:00-05:00",
        "2023-04-07 18:00:00-05:00",
        "2023-04-07 19:00:00-05:00",
    ],
}

# Use the timestamp strings as the row labels, then parse them into datetimes.
df = pd.DataFrame(data)
df = df.set_index("Date")
df.index = pd.to_datetime(df.index)

# Expand the datetime index into indicator columns and display the result.
df = make_dummies(df)
print(df)

这样并不能正确地合并数据。很抱歉只能贴截图,但这个函数只是把虚拟变量堆叠在原数据的下方;我希望所有虚拟变量都作为新列添加到 df 中,而不是堆叠在下面。希望我说清楚了:我想写一个函数,为每个小时、月份和星期几创建**所有**虚拟变量。

pprl5pva

pprl5pva1#

这里的 scikit-learn 版本看起来有点令人生畏,但似乎也很有效:

from sklearn.preprocessing import OneHotEncoder

def _dense_one_hot_encoder(categories):
    """Return a OneHotEncoder for ``categories`` that emits a dense array.

    scikit-learn 1.2 renamed the ``sparse`` keyword to ``sparse_output`` and
    1.4 removed the old spelling, so try the new name first and fall back to
    the old one on releases that do not know it yet.
    """
    try:
        return OneHotEncoder(categories=[categories], sparse_output=False)
    except TypeError:
        # Pre-1.2 scikit-learn: only the original ``sparse`` keyword exists.
        return OneHotEncoder(categories=[categories], sparse=False)


def _one_hot_frame(values, categories, columns, index):
    """One-hot encode ``values`` into a DataFrame labelled by ``columns``/``index``."""
    encoder = _dense_one_hot_encoder(categories)
    # The encoder expects a 2-D (n_samples, 1) array for a single feature.
    encoded = encoder.fit_transform(values.values.reshape(-1, 1))
    return pd.DataFrame(encoded, columns=columns, index=index)


def make_dummies(df):
    """Return a copy of ``df`` with hour, weekday and month one-hot columns.

    The categories are fixed up front (24 hours, 7 weekdays, 12 months), so
    every indicator column is present even when the data only covers a few
    of them. ``df.index`` is read through ``.hour``, ``.dayofweek`` and
    ``.month``; the caller's frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index exposes ``hour``, ``dayofweek`` and ``month``.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by ``hour_0`` .. ``hour_23``,
        ``monday`` .. ``sunday`` and ``jan`` .. ``dec``, all sharing the
        original index.
    """
    # Create a new DataFrame to hold the encoded data
    encoded_df = df.copy()
    index = encoded_df.index

    # Hours of the day: categories 0..23 -> hour_0 .. hour_23
    hour_columns = [f'hour_{i}' for i in range(24)]
    hour_df = _one_hot_frame(index.hour, range(24), hour_columns, index)

    # Days of the week: categories 0..6, Monday first
    day_columns = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
    day_df = _one_hot_frame(index.dayofweek, range(7), day_columns, index)

    # Months of the year: categories 1..12
    month_columns = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    month_df = _one_hot_frame(index.month, range(1, 13), month_columns, index)

    # Every frame shares the same index, so the column-wise concat aligns rows.
    encoded_df = pd.concat([encoded_df, hour_df, day_df, month_df], axis=1)

    return encoded_df
6jygbczu

6jygbczu2#

您只是漏掉了几处 set_index,它们用来在 pd.concat 时对齐索引:

def make_dummies(df):
    """Return ``df`` extended with hour, weekday, weekend and month indicators.

    ``df.index`` is read through ``.hour``, ``.strftime``, ``.dayofweek`` and
    ``.month``. Each dummy frame that is not built from a column of ``df`` is
    re-indexed with ``set_index(df.index)`` so that ``pd.concat(axis=1)``
    lines the rows up instead of stacking them.

    The caller's DataFrame is left untouched: the helper columns are written
    to a private copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index exposes ``hour``, ``strftime``, ``dayofweek`` and
        ``month``.

    Returns
    -------
    pandas.DataFrame
        The original columns, a 0/1 ``weekend`` flag, two sets of ``hour_*``
        dummies (only for hours present in the data), ``monday`` ..
        ``sunday`` and ``jan`` .. ``dec``.
    """
    # Work on a copy so the helper columns below never leak into the
    # DataFrame the caller passed in.
    df = df.copy()

    # Create dummies for the hours present in the index
    hours = pd.get_dummies(df.index.hour, prefix='hour').set_index(df.index)  # HERE: align with df.index

    # Create columns for hour, day of week, weekend, and month
    df['hour'] = df.index.strftime('%H')
    df['day_of_week'] = df.index.dayofweek
    df['weekend'] = np.where(df['day_of_week'].isin([5,6]), 1, 0)
    df['month'] = df.index.month

    # Create dummies from the zero-padded hour strings. This duplicates
    # `hours` above (same labels for two-digit hours); it is kept so the
    # returned columns stay exactly as before.
    hour_dummies = pd.get_dummies(df['hour'], prefix='hour')

    # Create dummies for all days of the week; listing every name as a
    # category yields a column even for days absent from the data.
    day_mapping = {0: 'monday', 1: 'tuesday', 2: 'wednesday', 3: 'thursday', 4: 'friday', 5: 'saturday', 6: 'sunday'}
    all_days = pd.Categorical(df['day_of_week'].map(day_mapping), categories=day_mapping.values())
    day_dummies = pd.get_dummies(all_days).set_index(df.index)  # HERE: align with df.index

    # Create dummies for all months of the year, same approach
    month_mapping = {1: 'jan', 2: 'feb', 3: 'mar', 4: 'apr', 5: 'may', 6: 'jun', 7: 'jul',
                     8: 'aug', 9: 'sep', 10: 'oct', 11: 'nov', 12: 'dec'}
    all_months = pd.Categorical(df['month'].map(month_mapping), categories=month_mapping.values())
    month_dummies = pd.get_dummies(all_months).set_index(df.index)  # HERE: align with df.index

    # Merge all dummies with original DataFrame
    df = pd.concat([df, hours, hour_dummies, day_dummies, month_dummies], axis=1)

    # Drop redundant columns
    df = df.drop(['hour', 'day_of_week', 'month'], axis=1)

    return df

注:我认为 hours 和 hour_dummies 是冗余的。

输出:

>>> make_dummies(df)
                            temp  weekend  hour_15  hour_16  hour_17  hour_18  hour_19  hour_15  hour_16  ...  apr  may  jun  jul  aug  sep  oct  nov  dec
Date                                                                                                      ...                                             
2023-04-07 15:00:00-05:00  53.13        0        1        0        0        0        0        1        0  ...    1    0    0    0    0    0    0    0    0
2023-04-07 16:00:00-05:00  52.93        0        0        1        0        0        0        0        1  ...    1    0    0    0    0    0    0    0    0
2023-04-07 17:00:00-05:00  52.56        0        0        0        1        0        0        0        0  ...    1    0    0    0    0    0    0    0    0
2023-04-07 18:00:00-05:00  51.58        0        0        0        0        1        0        0        0  ...    1    0    0    0    0    0    0    0    0
2023-04-07 19:00:00-05:00  47.57        0        0        0        0        0        1        0        0  ...    1    0    0    0    0    0    0    0    0

[5 rows x 31 columns]

相关问题