python 两数据集模糊搜索问题

jdg4fx2g  于 2023-06-28  发布在  Python
关注(0)|答案(1)|浏览(115)
import pandas as pd
from fuzzywuzzy import fuzz
from recordlinkage.preprocessing import clean
from concurrent.futures import ThreadPoolExecutor

# Names of the columns to compare, one per dataset.
column1_name = 'Sold To Customer Name'
column2_name = 'customer_name'

try:
    # Read datasets directly using read_csv; every column is loaded as text
    # and malformed lines are skipped.
    dataset1 = pd.read_csv(r"C:\Users\JE\Downloads\edw_query_extract_distinct.csv", low_memory=False, on_bad_lines='skip', index_col=False, dtype='unicode')

    dataset2 = pd.read_csv(r"C:\Users\JE\Downloads\ISO_Code_joined.csv", low_memory=False, on_bad_lines='skip', index_col=False, dtype='unicode')

    # Select columns to compare
    # NOTE(review): astype(str) turns missing values into the literal string
    # 'nan', so the dropna() calls below presumably no longer remove them.
    column1 = dataset1[column1_name].astype(str)
    column2 = dataset2[column2_name].astype(str)

    # Matching threshold: a candidate must score strictly above this value.
    threshold = 80

    # Drop missing values and duplicates
    column1 = column1.dropna().drop_duplicates().dropna()
    column2 = column2.dropna().drop_duplicates().dropna()

    matches = []

    # Materialise column2 as a list of [index label, value] pairs
    column2_index = column2.reset_index().values.tolist()

    # NOTE(review): no work is ever submitted to this executor, so the loop
    # below still runs sequentially.
    with ThreadPoolExecutor() as executor:
        # Series.items() replaces iteritems(), which was removed in pandas 2.0.
        for i, value1 in column1.items():
            best_match = None
            best_score = threshold

            for j, value2 in column2_index:
                # NOTE(review): recordlinkage's clean() is documented to take
                # a pandas Series; calling it on a single str looks like the
                # source of the "'str' object has no attribute 'shape'" error
                # reported for this listing.
                s1 = clean(value1)  # Preprocess strings for fuzzy matching
                s2 = clean(value2)
                score = fuzz.token_set_ratio(s1, s2)  # Calculate fuzzy similarity

                # Keep the first candidate with the highest score so far.
                if score > best_score:
                    best_match = j
                    best_score = score

            if best_match is not None:
                matches.append((i, best_match, best_score))

    # Assemble the result: matched index labels, score, and the original values.
    matched_data = pd.DataFrame(matches, columns=['Index_1', 'Index_2', 'Similarity_Score'])
    matched_data['Value_1'] = column1.loc[matched_data['Index_1']].values
    matched_data['Value_2'] = column2.loc[matched_data['Index_2']].values

    print(matched_data)

except FileNotFoundError:
    print("Error: One or both dataset files not found.")
except KeyError:
    print("Error: One or both column names are not present in the datasets.")
except Exception as e:
    print(f"An error occurred: {str(e)}")

此代码运行后报错:'str' object has no attribute 'shape'(“str”对象没有属性“shape”)。
我的主要目标是比较来自两个不同数据集的两列。这些值的写法可能带有拼写错误、缩写,或两者兼有,最终得到的数据集需要包含匹配上的值。

ecbunoof

ecbunoof1#

该错误意味着在column1和column2变量上调用astype(str)方法时存在问题。据我所知,当astype()的输入不是Pandas DataFrame或Series对象时会发生此错误。
在您的代码中,dataset1和dataset2似乎已经是DataFrame了,因此不需要在这两列上调用astype(str)。您可以修改代码,删除column1和column2上的astype(str)调用。

import pandas as pd
from fuzzywuzzy import fuzz
from recordlinkage.preprocessing import clean
from concurrent.futures import ThreadPoolExecutor

# Names of the columns to compare, one per dataset.
column1_name = 'Sold To Customer Name'
column2_name = 'customer_name'


def find_best_matches(cleaned1, cleaned2, threshold, scorer):
    """Find, for each entry of ``cleaned1``, its best-scoring entry in ``cleaned2``.

    Args:
        cleaned1: pandas Series of preprocessed strings to look up.
        cleaned2: pandas Series of preprocessed candidate strings.
        threshold: a candidate is accepted only if it scores strictly above
            this value.
        scorer: callable ``scorer(s1, s2)`` returning a comparable score.

    Returns:
        A list of ``(index1, index2, score)`` tuples, one per entry of
        ``cleaned1`` that has an accepted candidate. On equal scores the
        earliest candidate wins.
    """
    # Materialise the candidates once instead of on every outer iteration.
    candidates = list(cleaned2.items())
    matches = []

    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for i, s1 in cleaned1.items():
        best_match = None
        best_score = threshold

        for j, s2 in candidates:
            score = scorer(s1, s2)
            if score > best_score:
                best_match = j
                best_score = score

        if best_match is not None:
            matches.append((i, best_match, best_score))

    return matches


try:
    # Read datasets directly using read_csv; every column is loaded as text
    # and malformed lines are skipped.
    dataset1 = pd.read_csv(r"C:\Users\JE\Downloads\edw_query_extract_distinct.csv", low_memory=False, on_bad_lines='skip', index_col=False, dtype='unicode')
    dataset2 = pd.read_csv(r"C:\Users\JE\Downloads\ISO_Code_joined.csv", low_memory=False, on_bad_lines='skip', index_col=False, dtype='unicode')

    # Select columns to compare
    column1 = dataset1[column1_name]
    column2 = dataset2[column2_name]

    # Matching threshold
    threshold = 80

    # Drop missing values and duplicates; the original index labels are kept
    # so results can be mapped back to the raw values.
    column1 = column1.dropna().drop_duplicates()
    column2 = column2.dropna().drop_duplicates()

    # clean() operates on a whole pandas Series. Calling it on a single str
    # is what raised "'str' object has no attribute 'shape'". Cleaning each
    # column once also avoids re-cleaning every candidate for every lookup.
    cleaned1 = clean(column1)
    cleaned2 = clean(column2)

    matches = find_best_matches(cleaned1, cleaned2, threshold, fuzz.token_set_ratio)

    # Assemble the result: matched index labels, score, and the original values.
    matched_data = pd.DataFrame(matches, columns=['Index_1', 'Index_2', 'Similarity_Score'])
    matched_data['Value_1'] = column1.loc[matched_data['Index_1']].values
    matched_data['Value_2'] = column2.loc[matched_data['Index_2']].values

    print(matched_data)

except Exception as e:
    print(f"An error occurred: {str(e)}")

相关问题