import pandas as pd
from fuzzywuzzy import fuzz
from recordlinkage.preprocessing import clean
from concurrent.futures import ThreadPoolExecutor
column1_name = 'Sold To Customer Name'
column2_name = 'customer_name'
try:
# Read datasets directly using read_csv
dataset1 = pd.read_csv(r"C:\Users\JE\Downloads\edw_query_extract_distinct.csv",low_memory=False,on_bad_lines='skip', index_col=False, dtype='unicode')
dataset2 = pd.read_csv(r"C:\Users\JE\Downloads\ISO_Code_joined.csv",low_memory=False,on_bad_lines='skip', index_col=False, dtype='unicode')
# Select columns to compare
column1 = dataset1[column1_name].astype(str)
column2 = dataset2[column2_name].astype(str)
# Matching threshold
threshold = 80
# Drop rows with NaN or empty values
column1 = column1.dropna().drop_duplicates().dropna()
column2 = column2.dropna().drop_duplicates().dropna()
matches = []
# Create index for column2
column2_index = column2.reset_index().values.tolist()
with ThreadPoolExecutor() as executor:
for i, value1 in column1.iteritems():
best_match = None
best_score = threshold
for j, value2 in column2_index:
`your text` s1 = clean(value1) # Preprocess strings for fuzzy matching
s2 = clean(value2)
score = fuzz.token_set_ratio(s1, s2) # Calculate fuzzy similarity
if score > best_score:
best_match = j
best_score = score
if best_match is not None:
matches.append((i, best_match, best_score))
matched_data = pd.DataFrame(matches, columns=['Index_1', 'Index_2', 'Similarity_Score'])
matched_data['Value_1'] = column1.loc[matched_data['Index_1']].values
matched_data['Value_2'] = column2.loc[matched_data['Index_2']].values
print(matched_data)
except FileNotFoundError:
print("Error: One or both dataset files not found.")
except KeyError:
print("Error: One or both column names are not present in the datasets.")
except Exception as e:
print(f"An error occurred: {str(e)}")
此代码返回发生错误:“str”对象没有属性“shape”
我的主要目标是比较来自两个不同数据集的两个不同列,它们可以用错误或缩写以及上述所有内容编写,并且结果数据集需要具有匹配的值。
1条答案
按热度按时间ecbunoof1#
该错误意味着在column1和column2变量上调用astype(str)方法时存在问题。据我所知,当astype()的输入不是Pandas DataFrame或Series对象时会发生此错误。
在您的代码中,dataset1和dataset2似乎已经是DataFrames了,因此不需要在cols上调用astype(str)。您可能可以修改代码以删除column1和column2上的astype(str)调用。