我有一个调查数据集,我需要将其中的一部分扩展为列,其中包含对所问评级问题的文本响应。数据集很大,最好的方法是什么?
import pandas as pd
pd.DataFrame({'S.No': {0: 63.0,
1: nan,
2: nan,
3: nan,
4: 204.0,
5: nan,
6: nan,
7: nan,
8: 238.0,
9: nan,
10: nan,
11: nan,
12: 292.0,
13: nan,
14: nan,
15: nan,
16: 332.0,
17: nan,
18: nan,
19: nan},
'ID': {0: 251,
1: 251,
2: 251,
3: 251,
4: 252,
5: 252,
6: 252,
7: 252,
8: 253,
9: 253,
10: 253,
11: 253,
12: 254,
13: 254,
14: 254,
15: 254,
16: 255,
17: 255,
18: 255,
19: 255},
'Name': {0: 'Bob',
1: 'Bob',
2: 'Bob',
3: 'Bob',
4: 'Foo',
5: 'Foo',
6: 'Foo',
7: 'Foo',
8: 'Mike',
9: 'Mike',
10: 'Mike',
11: 'Mike',
12: 'Mary',
13: 'Mary',
14: 'Mary',
15: 'Mary',
16: 'Bar',
17: 'Bar',
18: 'Bar',
19: 'Bar'},
'User Function': {0: 'Sales',
1: 'Sales',
2: 'Sales',
3: 'Sales',
4: 'Mktg',
5: 'Mktg',
6: 'Mktg',
7: 'Mktg',
8: 'Finance',
9: 'Finance',
10: 'Finance',
11: 'Finance',
12: 'Sales',
13: 'Sales',
14: 'Sales',
15: 'Sales',
16: 'Mktg',
17: 'Mktg',
18: 'Mktg',
19: 'Mktg'},
'Business Unit': {0: 'BU1',
1: 'BU1',
2: 'BU1',
3: 'BU1',
4: 'BU2',
5: 'BU2',
6: 'BU2',
7: 'BU2',
8: 'BU3',
9: 'BU3',
10: 'BU3',
11: 'BU3',
12: 'BU1',
13: 'BU1',
14: 'BU1',
15: 'BU1',
16: 'BU2',
17: 'BU2',
18: 'BU2',
19: 'BU2'},
'Gender': {0: 'Male',
1: 'Male',
2: 'Male',
3: 'Male',
4: 'Male',
5: 'Male',
6: 'Male',
7: 'Male',
8: 'Male',
9: 'Male',
10: 'Male',
11: 'Male',
12: 'Female',
13: 'Female',
14: 'Female',
15: 'Female',
16: 'Male',
17: 'Male',
18: 'Male',
19: 'Male'},
'Primary Exit Reason': {0: 'Policy',
1: 'Policy',
2: 'Policy',
3: 'Policy',
4: 'Team',
5: 'Team',
6: 'Team',
7: 'Team',
8: 'Navigation',
9: 'Navigation',
10: 'Navigation',
11: 'Navigation',
12: 'Others',
13: 'Others',
14: 'Others',
15: 'Others',
16: 'Policy',
17: 'Policy',
18: 'Policy',
19: 'Policy'},
'Primary Question': {0: 'A',
1: 'B',
2: 'C',
3: 'D',
4: 'E',
5: 'F',
6: 'G',
7: 'H',
8: 'I',
9: 'J',
10: 'K',
11: 'L',
12: 'M',
13: 'N',
14: 'O',
15: 'P',
16: 'A',
17: 'B',
18: 'C',
19: 'D'},
'Primary Response': {0: nan,
1: nan,
2: nan,
3: nan,
4: 'Agree',
5: 'Agree',
6: 'No',
7: nan,
8: 'Agree',
9: 'Agree',
10: 'No',
11: nan,
12: nan,
13: nan,
14: nan,
15: nan,
16: nan,
17: nan,
18: nan,
19: nan},
'Secondary Exit Reason': {0: 'Policy',
1: 'Policy',
2: 'Policy',
3: 'Policy',
4: 'Others',
5: 'Others',
6: 'Others',
7: 'Others',
8: 'Transport',
9: 'Transport',
10: 'Transport',
11: 'Transport',
12: 'Policy',
13: 'Policy',
14: 'Policy',
15: 'Policy',
16: 'Policy',
17: 'Policy',
18: 'Policy',
19: 'Policy'},
'Secondary Question': {0: 'A',
1: 'B',
2: 'C',
3: 'D',
4: 'M',
5: 'N',
6: 'O',
7: 'P',
8: 'Q',
9: 'R',
10: 'S',
11: 'T',
12: 'A',
13: 'B',
14: 'C',
15: 'D',
16: 'A',
17: 'B',
18: 'C',
19: 'D'},
'Secondary Response': {0: 'Agree',
1: 'Agree',
2: 'Yes',
3: nan,
4: nan,
5: nan,
6: nan,
7: nan,
8: nan,
9: nan,
10: nan,
11: nan,
12: 'Agree',
13: 'Agree',
14: 'No',
15: nan,
16: 'Highly Agree',
17: 'Agree',
18: "I'm unaware",
19: nan},
'Feedback Question': {0: 'A',
1: 'B',
2: 'C',
3: 'D',
4: 'A',
5: 'B',
6: 'C',
7: 'D',
8: 'A',
9: 'B',
10: 'C',
11: 'D',
12: 'A',
13: 'B',
14: 'C',
15: 'D',
16: 'A',
17: 'B',
18: 'C',
19: 'D'},
'Feedback Reason': {0: '4',
1: '8',
2: nan,
3: nan,
4: '4',
5: '7',
6: 'NO ALL GOOD',
7: 'NO\n',
8: '4',
9: '6',
10: 'No',
11: 'No',
12: '5',
13: '6',
14: 'No',
15: 'No',
16: '5',
17: '10',
18: 'YES GOOD XP',
19: 'Yes'}})
上面是一个可复制的数据集。需要将Primary Question
、Secondary Question
和Feedback Question
展开为列,将Primary Response
、Secondary Response
和Feedback Response
作为这些列的值。其余所有变量都不会改变,因此保留在一行中,例如'S.No'
,'ID'
,'Name'
,'User Function'
,'Business Unit'
,'Gender'
,'Issues'。每个ID由4行组成
我尝试使用groupby()
ID,然后unstack()
-ing所需的列,但没有工作。也尝试了melt
和pivot
。
df.melt(id_vars=['S.No','ID','Name','User Function','Business Unit','Gender','Primary Exit Reason'],
var_name='var')
我不知道如何将每一列分成单独的列
预期产出:
1条答案
按热度按时间lhcgjxsq1#
这是你所期望的输出,请注意,我只是从宽到长的反馈原因,次要响应和主要响应转换,因为它们包含的信息我可以很容易地识别。
这些步骤是可重复的,可供您添加其他列。
附加这里有一个函数,可以使代码更具可读性和可复制性。
您可以通过
result_df = transform_column('Feedback Reason')
使用该函数