pandas: header problem in a TDM generated with Python

Asked by ymzxtsji on 2023-05-12, in Python

I'm having some problems with my Python TDM. Right now it takes a CSV generated by another of my applications and then creates a term-document matrix from it. The current problem is that some words whose frequency in the dictionary is 0 still show up in the header.
This is the current output. So in this case, words like one, simply, focus, money, and so on all the way to the right should not be added to or shown in the generated TDM file.

import csv
import os
import re
from collections import Counter
from pathlib import Path
from tkinter import filedialog
from tkinter import messagebox as mb

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

# Note: copyfile_tdm is a helper defined elsewhere in the asker's project

def termDocumentMatrix():
    # Get filenames of CSV files
    filenames = filedialog.askopenfilename(
        title="Datafluent | Open CSV files for TDM", filetypes=[("Comma Separated Value", "*.csv")]
    )

    # Check file paths
    absolute_path = os.path.dirname(__file__)
    relative_path = "temp/upload/to_tdm"
    folderdir = os.path.join(absolute_path, relative_path)

    # Set new filename for generated CSV
    new_filename = Path(filenames).stem
    new_filename = new_filename.replace(' ', '_')

    # Upload file to temp folder
    try:
        copyfile_tdm(filenames, folderdir)
    except Exception:
        mb.showerror(title="Error!", message="File can't be Opened! Might be Wrong Format or Damaged!")

    # Read raw data from file
    data = pd.read_csv(filenames, header=None)
    tdmfile = data[0].str.cat(sep=' ')

    # Clean data by removing commas and newlines
    tdmfile = tdmfile.replace(",", "")
    tdmfile = tdmfile.replace("\n", "")

    # Create Lemmatization Object
    lemmatizer = WordNetLemmatizer()

    # Tokenize text into sentences
    tokenizer = sent_tokenize(tdmfile)

    # Lemmatize words to get their base form and remove stop words
    stop_words = set(stopwords.words('english'))
    lemmawords = []
    for sentence in tokenizer:
        # Convert non-alphabetic characters to spaces
        sentence = re.sub('[^a-zA-Z]', ' ', sentence)
        tokens = word_tokenize(sentence.lower())
        # Remove stop words and lemmatize the remaining tokens
        lemmawords += [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    # Create bag of words dictionary and filter out words with low frequency
    MIN_FREQUENCY = 2
    word_counts = Counter(lemmawords)
    dictionary = {word: i for i, word in enumerate(word_counts.keys()) if word_counts[word] >= MIN_FREQUENCY}

    # Build bag of words model
    sentence_vectors = []
    for sentence in tokenizer:
        sentence_words = set(word_counts.keys()).intersection(set(word_tokenize(sentence)))
        vector = [word_counts[word] for word in sentence_words if word in dictionary]
        sentence_vectors.append(vector)

    sentence_vectors = np.asarray(sentence_vectors)

    # Write output to CSV file
    output_path = f"{new_filename}_TDM.csv"
    with open(output_path, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        header = [word for word in dictionary.keys() if word in set(lemmawords)]
        writer.writerow(header)
        for row in sentence_vectors:
            if not all(x == 0 for x in row):
                writer.writerow(row)

    # Open output file
    os.system(f'start {output_path}')

I've tried several fixes, but they only made things worse. I tried building sentence_vectors together with the header words, but that didn't work. I also tried adjusting the header, but that didn't work either.
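For reference, the root of the mismatch can be reproduced with toy data (this sketch is illustrative only, not the asker's CSV): the header is one fixed list of words, but each row vector only contains counts for the words that happen to occur in that sentence, so rows have varying lengths and the columns stop lining up with the header.

# Minimal sketch of the alignment bug in the code above (toy data,
# not part of the original post)
from collections import Counter

sentences = [["cat", "dog"], ["dog"]]  # two toy tokenized sentences
word_counts = Counter(w for s in sentences for w in s)
dictionary = {w: i for i, w in enumerate(word_counts)}

# The header is one fixed list of words...
header = list(dictionary)  # ['cat', 'dog'] -> 2 columns
# ...but each row only has entries for the words present in that sentence
rows = [[word_counts[w] for w in s if w in dictionary] for s in sentences]

print(header)  # ['cat', 'dog']
print(rows)    # [[1, 2], [2]] -- second row is too short for the header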

ffscu2ro (answer 1)

You can use:

# Tokenize text into sentences
tokenizer = sent_tokenize(tdmfile)

# New code from here
MIN_FREQUENCY = 2
stop_words = set(stopwords.words('english'))  # set for fast membership tests

# Build one Counter (bag of words) per sentence; lemmatizer is the
# WordNetLemmatizer() already created in the question's code
bags = []
for sentence in tokenizer:
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    tokens = [lemmatizer.lemmatize(token)
              for token in word_tokenize(sentence.lower())
              if token not in stop_words]
    bags.append(Counter(tokens))

# Align all bags into a sentences x words matrix; words missing from a
# sentence become 0
bags = pd.DataFrame(bags).fillna(0).astype(int)
# Keep only columns whose total count exceeds MIN_FREQUENCY
# (use >= MIN_FREQUENCY instead to match the question's original condition)
bags = bags.loc[:, bags.sum() > MIN_FREQUENCY]

# Export to file, writing zeros as empty cells
bags.replace(0, '').to_csv(f'{new_filename}_TDM.csv', index=False)
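This fixes the header problem by construction: pandas builds the DataFrame by aligning the per-sentence Counters on their keys, so a word only becomes a column if it actually occurs somewhere, and the frequency filter then drops any column whose total count is below the threshold. The header and the data rows therefore always have exactly the same columns.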

Output:

>>> bags
     document  entrepreneur  businessman  choose  ten  make  given  answer  ...  attack  company  trick  cybercriminals  personal  vulnerable  sensitive  apps
0           1             2            1       1    1     1      1       1  ...       0        0      0               0         0           0          0     0
1           0             0            0       0    0     0      0       0  ...       0        0      0               0         0           0          0     0
2           0             1            1       0    0     0      0       0  ...       0        0      0               0         0           0          0     0
3           0             0            0       0    0     0      0       0  ...       0        0      0               0         0           0          0     0
4           0             0            0       0    0     1      0       0  ...       0        0      0               0         0           0          0     0
..        ...           ...          ...     ...  ...   ...    ...     ...  ...     ...      ...    ...             ...       ...         ...        ...   ...
267         0             0            0       0    0     0      0       0  ...       0        0      0               0         0           0          0     0
268         0             0            0       0    0     0      0       0  ...       0        0      0               0         0           0          0     0
269         0             0            0       0    0     0      0       0  ...       0        0      0               0         0           0          1     0
270         0             0            0       0    0     0      0       0  ...       0        0      0               0         0           0          0     1
271         0             0            0       0    0     0      0       0  ...       0        0      0               0         0           0          0     0

[272 rows x 337 columns]
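As a side note (not part of the original answer), scikit-learn's CountVectorizer builds this kind of term-document matrix directly. It skips the lemmatization step, and its min_df threshold counts the number of sentences a word appears in rather than its total frequency, so the resulting column set can differ slightly from the answer above:

# Alternative sketch using scikit-learn (assumes sklearn is installed;
# tdmfile and new_filename come from the question's code)
import pandas as pd
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer

sentences = sent_tokenize(tdmfile)  # same sentence list as in the question

# min_df=2 keeps words occurring in at least 2 sentences (document
# frequency, not total count); stop words are removed as before
cv = CountVectorizer(stop_words='english', min_df=2)
matrix = cv.fit_transform(sentences)

tdm = pd.DataFrame(matrix.toarray(), columns=cv.get_feature_names_out())
tdm.replace(0, '').to_csv(f'{new_filename}_TDM.csv', index=False)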
