pandas 足够相似时返回字符串差异

def get_intersection(descr1, descr2):
    """Return the set of whitespace-separated words common to both inputs.

    If either argument is missing according to ``pd.isna`` an empty set is
    returned instead.
    """
    for text in (descr1, descr2):
        if pd.isna(text):
            return set()
    first_words = set(descr1.split())
    second_words = set(descr2.split())
    return first_words & second_words

def get_unique_words(descr, intersection):
    """Return the words of ``descr`` that are absent from ``intersection``.

    The surviving words keep their original order and are joined with single
    spaces. ``None`` is returned when nothing is left.
    """
    remaining = [token for token in descr.split() if token not in intersection]
    joined = " ".join(remaining)
    if not joined:
        return None
    return joined

def get_unique_description(row):
    """Choose the distinguishing text for one row.

    With no overlap towards either neighbour the full
    ``"Product Description"`` is returned. Otherwise the unique words computed
    against the neighbour with the larger intersection are returned; on a tie
    the "next" neighbour wins.
    """
    next_overlap = len(row["next_product_intersection"])
    prev_overlap = len(row["prev_product_intersection"])

    if next_overlap == 0 and prev_overlap == 0:
        return row["Product Description"]

    chosen = (
        "next_product_unique_words"
        if next_overlap >= prev_overlap
        else "prev_product_unique_words"
    )
    return row[chosen]

# Neighbouring descriptions: shift(-1) pulls up the row below ("next"),
# shift(1) pushes down the row above ("prev").
for neighbour_column, offset in (("next_product", -1), ("prev_product", 1)):
    df[neighbour_column] = df["Product Description"].shift(offset)

# Words each description shares with its next / previous neighbour.
for side in ("next", "prev"):
    df[f"{side}_product_intersection"] = df.apply(
        lambda row, neighbour=f"{side}_product": get_intersection(
            row["Product Description"], row[neighbour]
        ),
        axis=1,
    )

# Words left over once the shared words are removed.
for side in ("next", "prev"):
    df[f"{side}_product_unique_words"] = df.apply(
        lambda row, shared=f"{side}_product_intersection": get_unique_words(
            row["Product Description"], row[shared]
        ),
        axis=1,
    )

# Pick the per-row result and drop the helper columns before printing.
df["Variance"] = df.apply(get_unique_description, axis=1)
df = df[["Product Description", "Variance"]]
print(df)

字符串
如何将这个过滤器添加到该框架中？
先谢谢你了。

你可以在下面尝试，但是正如@mozway 所指出的，很容易找到一个反例！

from difflib import SequenceMatcher
from itertools import chain, permutations

def fn(x, y, N=3):
    """Return the elements of ``x`` that ``SequenceMatcher`` aligns with ``y``.

    The matching blocks (minus the trailing zero-length sentinel) are
    flattened into one list of elements taken from ``x``. The list is returned
    only when it holds at least ``N`` elements; otherwise ``None`` is returned.
    """
    matcher = SequenceMatcher(None, x, y)
    shared = []
    # The last block returned by get_matching_blocks() is a dummy terminator.
    for block in matcher.get_matching_blocks()[:-1]:
        shared.extend(x[block.a:block.a + block.size])
    if len(shared) < N:
        return None
    return shared

# Split each description on whitespace; the negative lookahead is meant to
# keep a parenthesised group such as "(Case of 10)" together as one token.
descs = df["Product Description"].str.split(r"\s+(?![^[\(]*\))")

# Map each description to the words it shares with a sufficiently similar
# other description. fn() is evaluated once per ordered pair (the original
# comprehension called it twice); as before, a later match for the same
# description overwrites an earlier one.
d = {}
for s1, s2 in permutations(descs, r=2):
    common = fn(s1, s2)
    if common:
        d[" ".join(s1)] = common

# Descriptions without a similar partner map to "" (falsy).
_map = df["Product Description"].map(d).fillna("")

# For every description with a match, report the first word (in description
# order) that is not among the matched words. The original used
# set(...).pop(), which returns an arbitrary element and made the result
# nondeterministic whenever more than one word differed.
variance = []
for des, m in zip(descs, _map):
    extra = [word for word in des if word not in m] if m else []
    variance.append(extra[0] if extra else pd.NA)

df["Variance"] = variance

字符串
输出：

print(df)

                 Product Description      Variance
0            Petzl Red HMS Carabiner           Red
1           Petzl Blue HMS Carabiner          Blue
2         Petzl HMS Carabiner Orange        Orange
3              Petzl Green Carabiner          <NA>
4             Petzl Purple Carabiner          <NA>
5               Liquid Chalk - 100ml          <NA>
6  Liquid Chalk - 100ml (Case of 10)  (Case of 10)

型

pandas 足够相似时返回字符串差异

1条答案

相关问题

热门标签

最新问答