keras 如何在Python中加载数据集并处理它而不过载RAM?

55ooxyrt  于 2023-06-23  发布在  Python
关注(0)|答案(1)|浏览(109)

我把数据集扩充到 3.95 MB 之后,每次在 Kaggle 上启动训练,我的 TensorFlow/Keras LSTM 模型都会因为内存(RAM)耗尽而崩溃。我发现数据集太大,无法一次性全部加载;而即使改用数据加载器,训练也无法正常进行。我一直在寻找解决办法,但没有找到。如能提供任何帮助,不胜感激。

from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import tensorflow as tf

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.callbacks import LambdaCallback, ModelCheckpoint, ReduceLROnPlateau
import random
import sys

# Read the whole training corpus into memory as a single string
with open('/kaggle/input/crptic-python/dataset.txt', 'r') as file:
    text = file.read()

# The vocabulary is every distinct character of the corpus, in sorted order
vocabulary = sorted(set(text))

# Two-way lookup tables between a character and its position in the vocabulary
char_to_indices = {symbol: position for position, symbol in enumerate(vocabulary)}
indices_to_char = dict(enumerate(vocabulary))

# Slide a window of max_length characters over the text, moving forward
# `steps` characters at a time. Each window is one training input and the
# character immediately after it is the matching prediction target.
max_length = 100
steps = 5
sentences = [
    text[start: start + max_length]
    for start in range(0, len(text) - max_length, steps)
]
next_chars = [
    text[start + max_length]
    for start in range(0, len(text) - max_length, steps)
]

# One-hot encode every window and its target as boolean arrays:
#   X[n, t, k] is True when character t of window n is vocabulary entry k
#   y[n, k]    is True when the character following window n is entry k
# NOTE: X is a dense array holding
# len(sentences) * max_length * len(vocabulary) elements, all in memory at once.
X = np.zeros((len(sentences), max_length, len(vocabulary)), dtype=bool)
y = np.zeros((len(sentences), len(vocabulary)), dtype=bool)
for row, (window, target) in enumerate(zip(sentences, next_chars)):
    y[row, char_to_indices[target]] = True
    for pos, symbol in enumerate(window):
        X[row, pos, char_to_indices[symbol]] = True

# Build the character-level network: one 128-unit LSTM layer reads one-hot
# windows of shape (max_length, len(vocabulary)), followed by a dense layer
# and a softmax that produce one probability per vocabulary character
model = Sequential()
model.add(LSTM(128, input_shape=(max_length, len(vocabulary))))
model.add(Dense(len(vocabulary)))
model.add(Activation('softmax'))
# `learning_rate` is the supported keyword; the older `lr` alias is
# deprecated and is rejected by current Keras releases
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

def sample_index(preds, temperature=1.0):
    """Draw one index at random from a temperature-adjusted distribution.

    `preds` is converted to a float64 array, its logarithm is divided by
    `temperature`, and the result is exponentiated and renormalised so it
    sums to one. A single multinomial trial is drawn from that distribution
    and the index of the selected outcome is returned.
    """
    scaled_logits = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_logits)
    distribution = weights / np.sum(weights)
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)

def on_epoch_end(epoch, logs):
    """Print text sampled from the model when `epoch` is a multiple of 30.

    A random window of max_length characters from the corpus is used as the
    seed. For each diversity value, 400 characters are sampled one at a time
    and written to stdout as they are produced. `logs` is accepted to match
    the callback signature and is not used.
    """
    if epoch % 30 != 0:
        return

    print()
    print('----- Generating text after Epoch: % d' % epoch)

    # The same seed window is reused for every diversity setting
    seed_start = random.randint(0, len(text) - max_length - 1)
    seed_text = text[seed_start: seed_start + max_length]
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        window = seed_text
        print('----- Generating with seed: "' + window + '"')
        sys.stdout.write(window)

        for _ in range(400):
            # One-hot encode the current window as a batch of size one
            encoded = np.zeros((1, max_length, len(vocabulary)))
            for pos, symbol in enumerate(window):
                encoded[0, pos, char_to_indices[symbol]] = 1.

            probabilities = model.predict(encoded, verbose=0)[0]
            next_char = indices_to_char[sample_index(probabilities, diversity)]

            # Drop the oldest character and append the newly sampled one
            window = window[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

# Callback that runs on_epoch_end at the end of every epoch
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# Checkpoint callback: watches the training loss and, because
# save_best_only is set, writes weights.hdf5 only when the loss improves
filepath = "weights.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

# Learning-rate callback: when the training loss plateaus, multiply the
# learning rate by 0.2, never going below 0.001
reduce_alpha = ReduceLROnPlateau(
    monitor='loss',
    patience=1,
    factor=0.2,
    min_lr=0.001,
)
callbacks = [print_callback, checkpoint, reduce_alpha]

# Train the LSTM model on the full in-memory arrays
model.fit(X, y, batch_size=128, epochs=28, callbacks=callbacks)

def generate_text(length, diversity):
    """Return a random seed from the corpus followed by `length` sampled characters.

    The seed is a window of max_length characters taken at a random position
    in the corpus. At each step the current window is one-hot encoded and
    passed to the model, the next character is sampled with `diversity` as
    the temperature, and the window slides forward by one character.
    """
    seed_start = random.randint(0, len(text) - max_length - 1)
    window = text[seed_start: seed_start + max_length]
    pieces = [window]
    for _ in range(length):
        # One-hot encode the current window as a batch of size one
        encoded = np.zeros((1, max_length, len(vocabulary)))
        for pos, symbol in enumerate(window):
            encoded[0, pos, char_to_indices[symbol]] = 1.

        probabilities = model.predict(encoded, verbose=0)[0]
        next_char = indices_to_char[sample_index(probabilities, diversity)]

        pieces.append(next_char)
        window = window[1:] + next_char
    return ''.join(pieces)

# Generate 500 characters at diversity 0.5 and print the result
sample_text = generate_text(500, 0.5)
print(sample_text)

数据集是一个文本文件,包含未经整理的单词和短语。该模型用于实现自动补全功能。

ve7v8dk2

ve7v8dk21#

你可以编写一个自定义的 Python 生成器,利用关键字 yield 高效地把数据分批送入模型。我不知道你以前是否实现过类似的东西,不过构建这种生成器的思路很直观:写一个函数,接收你需要的所有参数(例如文件路径、batch_size、词汇表长度等),在函数内部写一个无限循环,在循环中加载数据并组装成批次,然后用 yield 把每个批次交给模型。
我写了下面这个代码片段,但它肯定还不够准确,你需要自行检查一下:

def text_generator(batch_size, max_length, steps, len_vocabulary,
                   path='/kaggle/input/crptic-python/dataset.txt',
                   char_to_index=None):
    """Yield one-hot encoded (X, y) training batches forever.

    The text file is read once. Windows of `max_length` characters are taken
    every `steps` characters, and each window is paired with the character
    that follows it. Only one batch of one-hot arrays exists at a time, so
    the full encoded dataset is never held in memory.

    Args:
        batch_size: Number of windows per yielded batch.
        max_length: Number of characters in each input window.
        steps: Distance in characters between consecutive window starts.
        len_vocabulary: Size of the one-hot dimension.
        path: Text file to read; defaults to the Kaggle dataset path.
        char_to_index: Mapping from character to one-hot position. When
            omitted, the module-level `char_to_indices` table is used.

    Yields:
        Tuples (X, y) of boolean arrays with shapes
        (batch_size, max_length, len_vocabulary) and
        (batch_size, len_vocabulary). After the last full batch of a pass
        the generator starts again from the beginning of the text; windows
        left over that do not fill a whole batch are dropped.

    Raises:
        ValueError: On the first `next()` call, if `batch_size` is smaller
            than 1 or the text yields fewer than `batch_size` windows.
            Without this check the loop would spin forever without ever
            yielding.
    """
    if char_to_index is None:
        # Fall back to the lookup table built at module level
        char_to_index = char_to_indices

    with open(path, 'r') as file:
        text = file.read()

    window_starts = range(0, len(text) - max_length, steps)
    if batch_size < 1 or len(window_starts) < batch_size:
        raise ValueError(
            'text yields %d windows, which cannot fill a batch of size %d'
            % (len(window_starts), batch_size)
        )

    # Start offsets of every full batch within window_starts
    last_full_batch = len(window_starts) - batch_size + 1
    while True:
        for first in range(0, last_full_batch, batch_size):
            X = np.zeros((batch_size, max_length, len_vocabulary), dtype=bool)
            y = np.zeros((batch_size, len_vocabulary), dtype=bool)
            batch_starts = window_starts[first: first + batch_size]
            for row, start in enumerate(batch_starts):
                for t, char in enumerate(text[start: start + max_length]):
                    X[row, t, char_to_index[char]] = True
                y[row, char_to_index[text[start + max_length]]] = True
            yield X, y

相关问题