import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import os
csv_dir = r"C:\Users\xyz\Desktop\CSVModel\preparing_dataset\Seperated\Airline\csvs"
datasets = []
column_mapping = {
"Bar Chart_1": ["Nationality", "First Name"],
"Bar Chart_2": ["Country Name", "Flight Status"],
"Line Chart_1": ["Name", "Departure data"],
"Pie Chart_1": ["Flight Status"],
"Pie Chart_2": ["Continents", "First Name"],
"Stacked Bar Chart_1": ["Continents", "Gender"]
}
for filename in os.listdir(csv_dir):
if filename.endswith(".csv"):
file_path = os.path.join(csv_dir, filename)
chart_type = filename.split("_")[1] # Assumes the pattern "Airline_ChartType_1.csv"
data = pd.read_csv(file_path)
column_names = data.columns.tolist()
print(f"Processing file: {filename}, Chart Type: {chart_type}")
print(f"Column Names: {column_names}")
columns_to_use = column_mapping.get(chart_type, [])
features = data[columns_to_use]
if not features.empty:
min_rows = features.shape[0]
dataset = pd.DataFrame({'DatasetID': range(1, min_rows + 1),
'Features': [list(row) for row in features.to_numpy()],
'GraphType': [chart_type] * min_rows})
datasets.append(dataset)
if datasets:
final_dataset = pd.concat(datasets, ignore_index=True)
print(final_dataset)
else:
print("No valid datasets were created.")
我的“columns_to_use”变量是空的,即使我已经交叉检查了我的列和CSV文件的名称
我想从我不同的csv文件中提取各种功能。每个csv文件都有不同的功能,由于某种原因,它不阅读我的列。当我单独阅读这些文件时,它的工作非常完美。请帮助我理解这个问题
我得到的输出:
Processing file: Airline_Bar Chart_1.csv, Chart Type: Bar Chart
Column Names: ['Nationality', 'First Name']
Processing file: Airline_Bar Chart_2.csv, Chart Type: Bar Chart
Column Names: ['Country Name', 'Flight Status']
Processing file: Airline_Line Chart_1.csv, Chart Type: Line Chart
Column Names: ['First Name', 'Departure Date']
Processing file: Airline_Pie Chart_1.csv, Chart Type: Pie Chart
Column Names: ['Flight Status']
Processing file: Airline_Pie Chart_2.csv, Chart Type: Pie Chart
Column Names: ['Continents', 'First Name']
Processing file: Airline_Stacked Bar Chart_1.csv, Chart Type: Stacked Bar Chart
Column Names: ['Continents', 'Gender']
No valid datasets were created.
1条答案
按热度按时间68bkxrlz1#
我明白问题所在了。
如果
chart_type
(根据您的输出)没有_1
或_2
,则在执行columns_to_use = column_mapping.get(chart_type, [])
时,它始终默认为[]
您应该
chart_type = filename.split("_", 1)[1]
只在第一个_
上拆分。现在,从
Airline_ChartType_1
可以得到['Airline', 'ChartType', '1']
您希望从
Airline_ChartType_1
获取['Airline', 'ChartType_1']