pandas：如何在 pandas DataFrame 中跳过列标题，循环遍历各行

我正在开发一个ID3的实现。这个程序的目标在大纲中有说明。有三个步骤。
步骤1：计算给定数据集所传递的信息量（MC，Message Conveyed）。
步骤2：对数据集中的每个属性重复上述计算。
给定以下两个数据集。

测试集

Venue,color,Model,Category,Location,weight,Veriety,Material,Volume
1,6,4,4,4,1,1,1,6
2,5,4,4,4,2,6,1,1
1,6,2,1,4,1,4,2,4
1,6,2,1,4,1,2,1,2
2,6,5,5,5,2,2,1,2
1,5,4,4,4,1,6,2,2
1,3,3,3,3,1,6,2,2

训练集

Venue,color,Model,Category,Location,weight,Veriety,Material,Volume
2,6,4,4,4,2,2,1,1
1,2,4,4,4,1,6,2,6
1,5,4,4,4,1,2,1,6
2,4,4,4,4,2,6,1,4
1,4,4,4,4,1,2,2,2
2,4,3,3,3,2,1,1,1
1,5,2,1,4,1,6,2,6
1,2,3,3,3,1,2,1,6
2,6,4,4,4,2,3,1,1

我想遍历每个数据集的行和列来计算属性的概率。

from numpy.core.defchararray import count
import pandas as pd
import numpy as np
import numpy as np
from math import ceil, floor, log2
from sklearn.decomposition import PCA
from numpy import linalg as LA
from sklearn.tree import DecisionTreeClassifier

def calculate_metrics(tp, tn, fn, p, n, fp):
    """Compute and display evaluation metrics for a binary classifier.

    Calculates the accuracy, error rate, sensitivity, precision and
    specificity of the selected classifier with respect to the
    corresponding test set, then prints them via ``display_metrics``.

    Args:
        tp: Number of true positives.
        tn: Number of true negatives.
        fn: Number of false negatives.
        p: Total number of positive records.
        n: Total number of negative records.
        fp: Number of false positives.

    Raises:
        ZeroDivisionError: If ``p + n``, ``p``, ``tp + fp`` or ``n`` is zero.
    """
    total = p + n

    # The numerators must be parenthesised: `tp + tn / total` would divide
    # only `tn` by the total because `/` binds tighter than `+`.
    accuracy = (tp + tn) / total
    error_rate = (fp + fn) / total
    sensitivity = tp / p
    precision = tp / (tp + fp)
    specificity = tn / n

    display_metrics(accuracy, error_rate, sensitivity, precision, specificity)

def display_metrics(accuracy, error_rate, sensitivity, precision, specificity):
    """Print the five evaluation metrics on one comma-separated line.

    Args:
        accuracy: Value shown after ``Accuracy:``.
        error_rate: Value shown after ``Error_rate:``.
        sensitivity: Value shown after ``Sensitivity:``.
        precision: Value shown after ``Precision:``.
        specificity: Value shown after ``specificity:``.
    """
    # Adjacent f-string literals are concatenated into a single report line.
    summary = (
        f'Accuracy: {accuracy}, '
        f'Error_rate:{error_rate}, '
        f'Sensitivity:{sensitivity}, '
        f'Precision:{precision}, '
        f'specificity:{specificity}'
    )
    print(summary)

def ID3(threshold,g):
    """Load the ID3 training/test sets and dump every column (work in progress).

    The training set is meant to be used to extract rules whose quality is
    then checked against the test set. At present the function only reads
    both CSV files and prints each column; the remaining ID3 steps are
    outlined in the comments below.

    Args:
        threshold: Pre-pruning threshold from the planned `alpha > threshold`
            check. Not used by the live code yet.
        g: Post-pruning parameter, planned to satisfy 0 <= g <= 0.15.
            Not used by the live code yet.
    """
    # use the training set to predict the test set.
    # use the Assignment 2--Training set to extract rules and test the quality of the extracted rules against the Assignment 2-- Test set for ID3.
    test_set = pd.read_csv("Assignment 2--Test set for ID3.csv")
    training_set = pd.read_csv("Assignment 2--Training set for ID3.csv")

    print(f'test_set: {test_set}')
    print(f'training_set: {training_set}')

    # Step 1- Calculate MC (Message Conveyed) for the given data set in reference to the class attribute
    # MC = -p1*log2(p1) - p2*log2(p2)
    # For n classes MC = -p1*log2(p1) - p2*log2(p2)-...-pn*log2(pn)

    # For each column
    # For each row
    # calculate the probability for an attribute

    # DataFrame.items() yields (column name, column Series) pairs.
    # It replaces DataFrame.iteritems(), which was removed in pandas 2.0.
    for key, value in training_set.items():
        if(isinstance(key, str)):
            print(f"Processing {key}")
            # NOTE: `pass` is a no-op; it does not skip the prints below
            # (`continue` would).
            pass
        print("********")
        print(key, value)
        print()
        print("********")
    
    for key, value in test_set.items():
        if(isinstance(key, str)):
            print(f"Processing {key}")
            # NOTE: `pass` is a no-op; it does not skip the prints below
            # (`continue` would).
            pass
        print("********")
        print(key, value)
        print()
        print("********")

    # set gain array

    # Loop
        # Step 2 - Repeat for every attribute

        # i) use the attribute as a node from which
        # k branches are emanating, where k is
        # the number of unique values in the attribute

        # ii) split the given data source based on the
        # unique values in the attribute

        # iii) calculate MC for new splits
        # calculate MC for each attribute of Venue

        # iv) calculate the weight for each split
        # start with venue
        
        # v) calculate the weighted MC (WMC) for the attribute
        # WMC(venue) = W(1)*MC(1) + W(2)*MC(2)

        # vi) Calculate Gain for the attribute [MC-WMC(venue)]
        # Gain(venue) = MC-WMC(venue)

        # Step 3- Repeat for each split produced by the root
        # if all records have the same class then break. 

        # Step 4- If every split is free of a mixture of class values, then stop
        # expansion of the tree

        # Step 5- Extract rules in form of if-then-else from the tree
    
    # select the max value from the gain array
    # this is the new root


    # # leaf generated from the decision tree.
    # F1 = 0

    # # define c1 count of records w/ dominant class in F1
    # # How do I determine the number of records w/ dominant class in F1?
    # c1 = 0

    # # alpha = c1/ |F1|
    # # F1 is one of the unique values of a given attribute.
    # alpha = c1/ abs(F1)

    # # the number of records in the test set that are correctly classified by the rules extracted from the tree before removal.
    # # How do I determine the number of records in test set that are correctly classified by rules extracted from the tree before removal?
    # N = 0

    # # the number of records in the test set that are correctly classified by the rules extracted from the tree.
    # # How do I determine the number of records in the test set that are correctly classified by the rules extracted from the tree?
    # M = 0

    # # the parameter and 0 <= g <= 0.15
    # g = 0

    # if g < 0 or g > 0.15:
    #     exit()

    # # k is the total number of branches in the subtree
    # # How do I determine the total number of branches in the subtree?
    # k = 0

    # if alpha > threshold:
    #     # stop splitting tree

    # # How do we apply prepruning to the data?

    # # For post-pruning use the criteria below
    # if (N-M)/Q < g*k:
    #     # remove subtree
    
    # # true positive
    # tp = 0 
    # # true negative
    # tn = 0
    # # false negative
    # fn = 0
    # # positive
    # p  = 0
    # # negative
    # n  = 0
    # # false positive
    # fp = 0

    # calculate_metrics(tp, tn, fn, p, n, fp)

def BayesClassifier():
    """Load the Bayes test and training CSV files (classification not implemented yet).

    The training set is intended to be used to classify the records of the
    test set; for now both files are only read into local DataFrames.
    """
    # Read in this order: test set first, then training set.
    csv_paths = (
        "Assignment 2--Test set for Bayes.csv",
        "Assignment 2--Training set for Bayes.csv",
    )
    test_set, training_set = (pd.read_csv(csv_path) for csv_path in csv_paths)

# Prompt the user to select either the ID3 or the Bayes classifier.
selection = input("Please enter your selection for either ID3 or Bayes classification: ")
threshold = input("Please enter a threshold: ")
g         = input("Please enter a value for g: ")

if selection == "ID3":
    # input() returns str; the pruning criteria planned in ID3 compare
    # threshold and g numerically, so convert them before the call.
    # float() raises ValueError when the text is not a number.
    ID3(float(threshold), float(g))
elif selection == "Bayes":
    BayesClassifier()

我需要帮助的地方是

# Step 1- Calculate MC (Message Conveyed) for the given data set in reference to the class attribute
    # MC = -p1*log2(p1) - p2*log2(p2)
    # For n classes MC = -p1log2(p1) - p2*log2(p2)-...-pn*log2(pn)

    # For each column
    # For each row
    # calculate the probability for an attribute

    # DataFrame.items() yields (column name, column Series) pairs.
    # It replaces DataFrame.iteritems(), which was removed in pandas 2.0.
    for key, value in training_set.items():
        if(isinstance(key, str)):
            print(f"Processing {key}")
            # NOTE: `pass` is a no-op; it does not skip the prints below
            # (`continue` would).
            pass
        print("********")
        print(key, value)
        print()
        print("********")
    
    for key, value in test_set.items():
        if(isinstance(key, str)):
            print(f"Processing {key}")
            # NOTE: `pass` is a no-op; it does not skip the prints below
            # (`continue` would).
            pass
        print("********")
        print(key, value)
        print()
        print("********")

预期：

********
Processing Volume
********
0     6
1     1
2     4
3     2
4     2
5     2
6     2
7     2
8     6
9     6
10    1
11    1
12    1
13    6
14    6
15    1
16    6
17    6
18    2
19    1
20    1
21    1
22    6
23    4
24    4
25    6
26    1
27    1
28    6
29    6
30    6
31    6
32    6
33    3
34    3
35    3
36    4
37    3
38    5
39    3
40    5
41    5
42    6
43    6
Name: Volume, dtype: int64

********

实际：

********
Processing Volume
********
Volume 0     6
1     1
2     4
3     2
4     2
5     2
6     2
7     2
8     6
9     6
10    1
11    1
12    1
13    6
14    6
15    1
16    6
17    6
18    2
19    1
20    1
21    1
22    6
23    4
24    4
25    6
26    1
27    1
28    6
29    6
30    6
31    6
32    6
33    3
34    3
35    3
36    4
37    3
38    5
39    3
40    5
41    5
42    6
43    6
Name: Volume, dtype: int64

********

我也尝试过在读取时使用 header=None，
结果如下：

********
8 0     Volume
1          6
2          1
3          4
4          2
5          2
6          2
7          2
8          2
9          6
10         6
11         1
12         1
13         1
14         6
15         6
16         1
17         6
18         6
19         2
20         1
21         1
22         1
23         6
24         4
25         4
26         6
27         1
28         1
29         6
30         6
31         6
32         6
33         6
34         3
35         3
36         3
37         4
38         3
39         5
40         3
41         5
42         5
43         6
44         6
Name: 8, dtype: object

********

我不希望显示 “Volume” 这个字符串。

我觉得你只需要把下面这一行代码改一下

print(key, value)

改为

print(value)

这样就不会再打印 key 变量中的字符串了。迭代 DataFrame 时，键是字符串类型的列名，值是 pandas.Series 类型的整列数据，因此只需一个 print(value) 就能打印整列。另外，我想你可能把 pass 关键字误当成 continue 了：pass 什么也不做，不会跳过后面的语句。
最小示例：

import pandas as pd
from io import StringIO

# Inline CSV so the example runs without any external file.
data = StringIO("""Venue,color,Model,Category,Location,weight,Veriety,Material,Volume
2,6,4,4,4,2,2,1,1
1,2,4,4,4,1,6,2,6
1,5,4,4,4,1,2,1,6
2,4,4,4,4,2,6,1,4
1,4,4,4,4,1,2,2,2
2,4,3,3,3,2,1,1,1
1,5,2,1,4,1,6,2,6
1,2,3,3,3,1,2,1,6
2,6,4,4,4,2,3,1,1""")

test_set = pd.read_csv(data)

# DataFrame.items() yields (column name, column Series) pairs.
# It replaces DataFrame.iteritems(), which was removed in pandas 2.0.
for key, value in test_set.items():
    if isinstance(key, str):
        print(f"Processing {key}")
    print("********")
    # Print only the Series; the column name was already shown above.
    print(value)
    print()
    print("********")

输出：

Processing Venue
********
0    2
1    1
2    1
3    2
4    1
5    2
6    1
7    1
8    2
Name: Venue, dtype: int64

********
Processing color
********
0    6
1    2
2    5
3    4
4    4
5    4
6    5
7    2
8    6
Name: color, dtype: int64

********
Processing Model
********
0    4
1    4
2    4
3    4
4    4
5    3
6    2
7    3
8    4
Name: Model, dtype: int64

********
Processing Category
********
0    4
1    4
2    4
3    4
4    4
5    3
6    1
7    3
8    4
Name: Category, dtype: int64

********
Processing Location
********
0    4
1    4
2    4
3    4
4    4
5    3
6    4
7    3
8    4
Name: Location, dtype: int64

********
Processing weight
********
0    2
1    1
2    1
3    2
4    1
5    2
6    1
7    1
8    2
Name: weight, dtype: int64

********
Processing Veriety
********
0    2
1    6
2    2
3    6
4    2
5    1
6    6
7    2
8    3
Name: Veriety, dtype: int64

********
Processing Material
********
0    1
1    2
2    1
3    1
4    2
5    1
6    2
7    1
8    1
Name: Material, dtype: int64

********
Processing Volume
********
0    1
1    6
2    6
3    4
4    2
5    1
6    6
7    6
8    1
Name: Volume, dtype: int64

********

pandas：如何在 pandas DataFrame 中跳过列标题，循环遍历各行

1条答案

相关问题

热门标签

最新问答