python 两数据集模糊搜索问题

jdg4fx2g 于 2023-06-28 发布在 Python

关注(0)|答案(1)|浏览(115)

import pandas as pd
from fuzzywuzzy import fuzz
from recordlinkage.preprocessing import clean
from concurrent.futures import ThreadPoolExecutor

# Names of the columns to compare, one from each dataset.
column1_name = 'Sold To Customer Name'
column2_name = 'customer_name'

try:
    # Read datasets directly using read_csv; every column is loaded as text
    # and malformed lines are skipped.
    dataset1 = pd.read_csv(r"C:\Users\JE\Downloads\edw_query_extract_distinct.csv",low_memory=False,on_bad_lines='skip', index_col=False, dtype='unicode')

    dataset2 = pd.read_csv(r"C:\Users\JE\Downloads\ISO_Code_joined.csv",low_memory=False,on_bad_lines='skip', index_col=False, dtype='unicode')

    # Select columns to compare
    # NOTE(review): astype(str) turns missing values into the literal string
    # 'nan', so the dropna() calls below presumably no longer remove them.
    column1 = dataset1[column1_name].astype(str)
    column2 = dataset2[column2_name].astype(str)

    # Matching threshold: a candidate must score strictly above this value
    threshold = 80

    # Drop rows with NaN or empty values, and repeated values
    column1 = column1.dropna().drop_duplicates().dropna()
    column2 = column2.dropna().drop_duplicates().dropna()

    # Collected (index in dataset1, index in dataset2, score) triples
    matches = []

    # Create index for column2: a list of [original index label, value] pairs
    column2_index = column2.reset_index().values.tolist()

    # NOTE(review): the executor is opened but no work is ever submitted to
    # it, so the loops below run sequentially in the main thread.
    with ThreadPoolExecutor() as executor:
        for i, value1 in column1.iteritems():
            best_match = None
            best_score = threshold

            for j, value2 in column2_index:
                # NOTE(review): clean() receives a single str here, while
                # recordlinkage documents it for a pandas Series - this is
                # the likely origin of the "'str' object has no attribute
                # 'shape'" error reported below the snippet.
                s1 = clean(value1)  # Preprocess strings for fuzzy matching
                s2 = clean(value2)
                score = fuzz.token_set_ratio(s1, s2)  # Calculate fuzzy similarity

                # Keep only the highest-scoring candidate above the threshold
                if score > best_score:
                    best_match = j
                    best_score = score

            if best_match is not None:
                matches.append((i, best_match, best_score))

    # Assemble the result table and look the original values up by index label
    matched_data = pd.DataFrame(matches, columns=['Index_1', 'Index_2', 'Similarity_Score'])
    matched_data['Value_1'] = column1.loc[matched_data['Index_1']].values
    matched_data['Value_2'] = column2.loc[matched_data['Index_2']].values

    print(matched_data)

except FileNotFoundError:
    print("Error: One or both dataset files not found.")
except KeyError:
    print("Error: One or both column names are not present in the datasets.")
except Exception as e:
    print(f"An error occurred: {str(e)}")

此代码运行后报错：An error occurred: 'str' object has no attribute 'shape'（“str”对象没有属性“shape”）。
我的主要目标是比较来自两个不同数据集的两列；这些值可能存在拼写错误、缩写等各种情况，最终得到的结果数据集需要包含相互匹配的值。

python

来源：https://stackoverflow.com/questions/76568708/fuzzy-search-in-two-datasets-issue

1条答案

按热度按时间

ecbunoof1#

该错误意味着在 column1 和 column2 变量上调用 astype(str) 方法时存在问题。据我所知，当 astype() 的输入不是 Pandas DataFrame 或 Series 对象时会发生此错误。
在您的代码中，dataset1 和 dataset2 似乎已经是 DataFrame 了，因此不需要在这些列上调用 astype(str)。您可以修改代码，删除 column1 和 column2 上的 astype(str) 调用。

import pandas as pd
from fuzzywuzzy import fuzz
from recordlinkage.preprocessing import clean
from concurrent.futures import ThreadPoolExecutor

# Names of the columns to compare, one from each dataset.
column1_name = 'Sold To Customer Name'
column2_name = 'customer_name'

try:
    # Read datasets directly using read_csv; every column is loaded as text
    # and malformed lines are skipped.
    dataset1 = pd.read_csv(r"C:\Users\JE\Downloads\edw_query_extract_distinct.csv", low_memory=False, on_bad_lines='skip', index_col=False, dtype='unicode')
    dataset2 = pd.read_csv(r"C:\Users\JE\Downloads\ISO_Code_joined.csv", low_memory=False, on_bad_lines='skip', index_col=False, dtype='unicode')

    # Select columns to compare. No astype(str): it would turn missing values
    # into the literal string 'nan' and hide them from dropna() below.
    column1 = dataset1[column1_name]
    column2 = dataset2[column2_name]

    # Matching threshold: a candidate must score strictly above this value
    threshold = 80

    # Drop missing values and repeated values; original index labels are kept
    column1 = column1.dropna().drop_duplicates()
    column2 = column2.dropna().drop_duplicates()

    # recordlinkage's clean() works on a whole pandas Series, not on a single
    # str - calling it per string is what raised
    # "'str' object has no attribute 'shape'". Cleaning each column once up
    # front also avoids repeating the work inside the nested loop.
    cleaned1 = clean(column1)
    cleaned2 = clean(column2)

    # (original index label, cleaned text) pairs for the second dataset,
    # materialised once because they are scanned for every value of column1
    candidates = list(cleaned2.items())

    # Collected (index in dataset1, index in dataset2, score) triples
    matches = []

    # Series.items() replaces iteritems(), which was removed in pandas 2.0
    for i, s1 in cleaned1.items():
        best_match = None
        best_score = threshold

        for j, s2 in candidates:
            score = fuzz.token_set_ratio(s1, s2)  # Calculate fuzzy similarity

            # Keep only the highest-scoring candidate above the threshold
            if score > best_score:
                best_match = j
                best_score = score

        if best_match is not None:
            matches.append((i, best_match, best_score))

    # Assemble the result table; the reported values are the original
    # (uncleaned) strings, looked up by index label
    matched_data = pd.DataFrame(matches, columns=['Index_1', 'Index_2', 'Similarity_Score'])
    matched_data['Value_1'] = column1.loc[matched_data['Index_1']].values
    matched_data['Value_2'] = column2.loc[matched_data['Index_2']].values

    print(matched_data)

except Exception as e:
    print(f"An error occurred: {str(e)}")

赞(0）回复(0）举报 2023-06-28

我来回答

python 两数据集模糊搜索问题

1条答案

相关问题

热门标签

最新问答