python – How to setup LSTM to use n-grams instead of sequence length?

To provide sequences of words instead of characters to an LSTM and have the LSTM predict words, the text has to be tokenized differently. Instead of splitting the document into sequences of n characters, the document has to be split into sequences of m words, and instead of having a vocabulary of distinct characters (the index mappings), you will have a vocabulary of distinct words from your corpus. I have revised the provided code to illustrate this idea for your review. Some of the parameters were modified for testing purposes (I created my own text file for evaluation), but they can be changed for your document(s).

# Setup
import re

import numpy as np
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.layers import LSTM, Dropout
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import RMSprop
import matplotlib


matplotlib.use('agg')

import heapq

# Loading the data
path = "text_2.txt"
# Read through a context manager so the file handle is always closed.
with open(path, encoding='utf8') as corpus_file:
    text = corpus_file.read().lower()
# print ('Corpus length: ',len(text))

# Preprocessing
# Strip punctuation: delete every character that is neither a word character
# (\w) nor whitespace (\s). The backslashes are essential -- the class [^ws]
# would instead delete everything except the literal letters "w" and "s".
text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
# The vocabulary is the sorted set of distinct words found in the corpus.
words = sorted(set(text.split()))
# Lookup tables: word -> one-hot index, and one-hot index -> word.
word_indices = {word: index for index, word in enumerate(words)}
indices_words = dict(enumerate(words))


print("unique words: ", len(words))

# Cut the corpus into chunks of SEQUENCE_LENGTH consecutive words, starting a
# new chunk every `step` words. For every chunk we also store the word that
# follows it, which is the word the model has to predict.

SEQUENCE_LENGTH = 3
step = 3
# The chunks must come from the running text in its original order. `words`
# holds the sorted distinct vocabulary, so slicing it would only yield
# alphabetical neighbours instead of real word sequences.
tokens = text.split()
sentences = []
next_chars = []
for i in range(0, len(tokens) - SEQUENCE_LENGTH, step):
    sentences.append(tokens[i:i + SEQUENCE_LENGTH])
    next_chars.append(tokens[i + SEQUENCE_LENGTH])
print('num training examples: ', len(sentences))

# Generating features and labels.
# Turn the word sequences and their target words into one-hot encoded arrays:
#   X[i, t, k] is set when word t of sequence i is vocabulary entry k
#   y[i, k]    is set when the word following sequence i is vocabulary entry k

# Use the builtin `bool` as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
X = np.zeros((len(sentences), SEQUENCE_LENGTH, len(words)), dtype=bool)
y = np.zeros((len(sentences), len(words)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, word in enumerate(sentence):
        X[i, t, word_indices[word]] = 1
    y[i, word_indices[next_chars[i]]] = 1

# Building the model
# One LSTM layer reads the one-hot encoded word sequence; a dense softmax
# layer then scores every vocabulary entry as the candidate next word.

model = Sequential()
model.add(LSTM(128, input_shape=(SEQUENCE_LENGTH, len(words))))
model.add(Dense(len(words)))
model.add(Activation('softmax'))

# Training
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=['accuracy'])

# Keep only the per-epoch metrics dictionary of the returned History object.
history = model.fit(X, y, validation_split=0.05, batch_size=128, epochs=1, shuffle=True).history


# Predicting

# Testing
def prepare_input(text):
    """One-hot encode a whitespace-separated prompt for the model.

    Args:
        text: Prompt made of words from the training vocabulary.

    Returns:
        Array of shape ``(1, SEQUENCE_LENGTH, len(words))``. Row ``t`` holds
        the one-hot vector of the ``t``-th encoded word; unused trailing rows
        stay all-zero.

    Raises:
        KeyError: If the prompt contains a word missing from ``word_indices``.
    """
    x = np.zeros((1, SEQUENCE_LENGTH, len(words)))
    prompt_words = text.split()
    # Only SEQUENCE_LENGTH time steps exist, so keep the most recent words;
    # indexing past the last time step would raise IndexError.
    first_kept = max(len(prompt_words) - SEQUENCE_LENGTH, 0)
    for t, word in enumerate(prompt_words[first_kept:]):
        x[0, t, word_indices[word]] = 1
    return x


# prepare_input returns a one-hot tensor of shape (1, SEQUENCE_LENGTH, len(words))


# The sample function
# This function asks the model's output for the indices of the most probable next words (the heap simplifies picking the top n)
def sample(preds, top_n=3):
    """Return the indices of the ``top_n`` largest entries of ``preds``.

    Args:
        preds: One-dimensional sequence of scores or probabilities.
        top_n: Number of indices to return.

    Returns:
        List of integer indices into ``preds``, ordered from the highest
        score to the lowest.
    """
    preds = np.asarray(preds).astype('float64')
    # Rank the raw scores directly. Passing them through log() and exp() and
    # renormalising does not change their order, and log() emits a
    # divide-by-zero warning whenever an entry is exactly 0.
    return heapq.nlargest(top_n, range(len(preds)), preds.take)


# Prediction function
def predict_completion(text):
    """Greedily generate ``SEQUENCE_LENGTH`` words that continue ``text``.

    Each round encodes the running text with ``prepare_input``, asks the
    model for a prediction, takes the top-ranked index from ``sample`` and
    appends the matching word to the running text before the next round.

    Args:
        text: Whitespace-separated prompt.

    Returns:
        The generated words only (the prompt is not included), joined by
        single spaces.
    """
    generated = []
    running_text = text
    for _ in range(SEQUENCE_LENGTH):
        encoded = prepare_input(running_text)
        scores = model.predict(encoded, verbose=0)[0]
        best_index = sample(scores, top_n=1)[0]
        word = indices_words[best_index]

        # Feed the chosen word back in as context for the next prediction.
        running_text = running_text + " " + word
        generated.append(word)

    return " ".join(generated)


# This method wraps everything and allows us to predict multiple completions
def predict_completions(text, n=3):
    """Return ``n`` alternative completions of ``text``.

    The model is asked once for the ``n`` most probable next words. Each
    candidate word is then extended with ``predict_completion``.

    Args:
        text: Whitespace-separated prompt.
        n: Number of alternative completions to produce.

    Returns:
        List of strings, one per candidate, each consisting of the candidate
        word followed by its generated continuation, separated by spaces.
    """
    x = prepare_input(text)
    preds = model.predict(x, verbose=0)[0]
    next_indices = sample(preds, n)
    # Slide the context window by one *word*: drop the oldest word and append
    # the candidate. Slicing the string with text[1:] would only drop a single
    # character and glue the candidate onto the previous word.
    context_tail = text.split()[1:]
    completions = []
    for idx in next_indices:
        candidate = indices_words[idx]
        context = " ".join(context_tail + [candidate])
        completions.append(" ".join([candidate, predict_completion(context)]))
    return completions

# Print the words the model generates after the one-word prompt "hello".
print(predict_completion("hello"))

The text file (text_2.txt) used to evaluate the tokenization step and to verify that its output can be fed to the model correctly contained:

hello. how are you today? i am doing well thank you for asking. i like sweet tea.

I may have misunderstood part of the question. Regarding how to train on n-grams, you can compute the n-grams (e.g., bigrams, trigrams) and add them as part of your training data, using the next word after the n-gram as the prediction. The tokenization updates are the same, but we would revise the preprocessing of the corpus:

def compute_n_gram(words, n):
    """Return every run of ``n`` consecutive items of ``words``.

    Args:
        words: Sequence of tokens.
        n: Length of each n-gram.

    Returns:
        List of slices of ``words`` in order of their start position; empty
        when ``words`` holds fewer than ``n`` items.
    """
    last_start = len(words) - n
    return [words[start:start + n] for start in range(last_start + 1)]


def compute_n_gram_with_next_word(words, n):
    """Pair each n-gram of ``words`` with the word that follows it.

    Args:
        words: Sequence of tokens.
        n: Length of each n-gram.

    Returns:
        Tuple ``(n_grams, next_words)`` of equal-length lists, where
        ``next_words[k]`` is the token right after ``n_grams[k]``.
    """
    all_grams = compute_n_gram(words, n)
    # The final n-gram ends at the last token and has no successor.
    contexts = all_grams[:-1]
    targets = [words[position] for position in range(n, n + len(contexts))]
    return contexts, targets

# Tokenise the running text and derive the vocabulary of distinct words.
words = text.split()
vocab = sorted(set(words))
vocab_size = len(vocab)

# Training examples: every bigram and every trigram, each paired with the
# word that follows it in the corpus.
bigrams, next_word_bigrams = compute_n_gram_with_next_word(words, 2)
trigrams, next_word_trigrams = compute_n_gram_with_next_word(words, 3)
sentences = [*bigrams, *trigrams]
next_words = [*next_word_bigrams, *next_word_trigrams]

So now, rather than taking every subsentence of three words and predicting the fourth, we use the bigrams and trigrams and predict the third or fourth word, respectively. This n-gram logic will also apply if you decide to do higher orders (four-gram, five-gram, etc.). SEQUENCE_LENGTH has to be greater than or equal to the maximum n-gram size you use, as it controls the size of your input (and any shorter input is padded with zeros). It also controls your output size with the current prediction code, as the generated text can’t exceed that length (unless you opt to use a windowing approach). The entire updated code is similar to the previous snippet, but for completeness:

# Setup
import re

import numpy as np
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.layers import LSTM, Dropout
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import RMSprop
import matplotlib
from tensorflow.python.keras import Input

matplotlib.use('agg')

import heapq

def compute_n_gram(words, n):
    """Return every run of ``n`` consecutive items of ``words``.

    Args:
        words: Sequence of tokens.
        n: Length of each n-gram.

    Returns:
        List of slices of ``words`` in order of their start position; empty
        when ``words`` holds fewer than ``n`` items.
    """
    last_start = len(words) - n
    return [words[start:start + n] for start in range(last_start + 1)]


def compute_n_gram_with_next_word(words, n):
    """Pair each n-gram of ``words`` with the word that follows it.

    Args:
        words: Sequence of tokens.
        n: Length of each n-gram.

    Returns:
        Tuple ``(n_grams, next_words)`` of equal-length lists, where
        ``next_words[k]`` is the token right after ``n_grams[k]``.
    """
    all_grams = compute_n_gram(words, n)
    # The final n-gram ends at the last token and has no successor.
    contexts = all_grams[:-1]
    targets = [words[position] for position in range(n, n + len(contexts))]
    return contexts, targets

# Loading the data
path = "text_2.txt"
# Read through a context manager so the file handle is always closed.
with open(path, encoding='utf8') as corpus_file:
    text = corpus_file.read().lower()
# print ('Corpus length: ',len(text))

# Preprocessing
# Strip punctuation: delete every character that is neither a word character
# (\w) nor whitespace (\s). The backslashes are essential -- the class [^ws]
# would instead delete everything except the literal letters "w" and "s".
text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
# `words` is the running text as a token list; `vocab` its distinct words.
words = text.split()
vocab = sorted(set(words))  # determine distinct words from corpus
vocab_size = len(vocab)
# Lookup tables: word -> one-hot index, and one-hot index -> word.
word_indices = {word: index for index, word in enumerate(vocab)}
indices_words = dict(enumerate(vocab))


# Report the vocabulary size; len(words) would be the total token count.
print("unique words: ", vocab_size)

# Build the training examples from n-grams: every bigram and every trigram of
# the corpus becomes an input sequence, and the word that follows it becomes
# the prediction target.

# Number of time steps of the model input (see the X array and the LSTM
# input_shape below); the n-grams built here are shorter than this.
SEQUENCE_LENGTH = 10
bigrams, next_word_bigrams = compute_n_gram_with_next_word(words, 2)
trigrams, next_word_trigrams = compute_n_gram_with_next_word(words, 3)
sentences = [*bigrams, *trigrams]
next_words = [*next_word_bigrams, *next_word_trigrams]
print('num training examples: ', len(sentences))

# Generating features and labels.
# Turn the n-grams and their target words into one-hot encoded arrays:
#   X[i, t, k] is set when word t of n-gram i is vocabulary entry k
#   y[i, k]    is set when the word following n-gram i is vocabulary entry k
# Time steps beyond the end of an n-gram stay all-zero.

# Use the builtin `bool` as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
X = np.zeros((len(sentences), SEQUENCE_LENGTH, len(vocab)), dtype=bool)
y = np.zeros((len(sentences), len(vocab)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, word in enumerate(sentence):
        X[i, t, word_indices[word]] = 1
    y[i, word_indices[next_words[i]]] = 1

# Building the model
# One LSTM layer reads the one-hot encoded word sequence; a dense softmax
# layer then scores every vocabulary entry as the candidate next word.

model = Sequential()
model.add(LSTM(128, input_shape=(SEQUENCE_LENGTH, vocab_size)))
model.add(Dense(vocab_size))
model.add(Activation('softmax'))

# Training
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=['accuracy'])

# Keep only the per-epoch metrics dictionary of the returned History object.
history = model.fit(X, y, validation_split=0.05, batch_size=128, epochs=1, shuffle=True).history


# Predicting

# Testing
def prepare_input(text):
    """One-hot encode a whitespace-separated prompt for the model.

    Args:
        text: Prompt made of words from the training vocabulary.

    Returns:
        Array of shape ``(1, SEQUENCE_LENGTH, vocab_size)``. Row ``t`` holds
        the one-hot vector of the ``t``-th encoded word; unused trailing rows
        stay all-zero.

    Raises:
        KeyError: If the prompt contains a word missing from ``word_indices``.
    """
    x = np.zeros((1, SEQUENCE_LENGTH, vocab_size))
    prompt_words = text.split()
    # Only SEQUENCE_LENGTH time steps exist, so keep the most recent words;
    # indexing past the last time step would raise IndexError.
    first_kept = max(len(prompt_words) - SEQUENCE_LENGTH, 0)
    for t, word in enumerate(prompt_words[first_kept:]):
        x[0, t, word_indices[word]] = 1
    return x


# prepare_input returns a one-hot tensor of shape (1, SEQUENCE_LENGTH, vocab_size)


# The sample function
# This function asks the model's output for the indices of the most probable next words (the heap simplifies picking the top n)
def sample(preds, top_n=3):
    """Return the indices of the ``top_n`` largest entries of ``preds``.

    Args:
        preds: One-dimensional sequence of scores or probabilities.
        top_n: Number of indices to return.

    Returns:
        List of integer indices into ``preds``, ordered from the highest
        score to the lowest.
    """
    preds = np.asarray(preds).astype('float64')
    # Rank the raw scores directly. Passing them through log() and exp() and
    # renormalising does not change their order, and log() emits a
    # divide-by-zero warning whenever an entry is exactly 0.
    return heapq.nlargest(top_n, range(len(preds)), preds.take)


# Prediction function
def predict_completion(text):
    """Greedily generate ``SEQUENCE_LENGTH`` words that continue ``text``.

    Each round encodes the running text with ``prepare_input``, asks the
    model for a prediction, takes the top-ranked index from ``sample`` and
    appends the matching word to the running text before the next round.

    Args:
        text: Whitespace-separated prompt.

    Returns:
        The generated words only (the prompt is not included), joined by
        single spaces.
    """
    generated = []
    running_text = text
    for _ in range(SEQUENCE_LENGTH):
        encoded = prepare_input(running_text)
        scores = model.predict(encoded, verbose=0)[0]
        best_index = sample(scores, top_n=1)[0]
        word = indices_words[best_index]

        # Feed the chosen word back in as context for the next prediction.
        running_text = running_text + " " + word
        generated.append(word)

    return " ".join(generated)


# This method wraps everything and allows us to predict multiple completions
def predict_completions(text, n=3):
    """Return ``n`` alternative completions of ``text``.

    The model is asked once for the ``n`` most probable next words. Each
    candidate word is then extended with ``predict_completion``.

    Args:
        text: Whitespace-separated prompt.
        n: Number of alternative completions to produce.

    Returns:
        List of strings, one per candidate, each consisting of the candidate
        word followed by its generated continuation, separated by spaces.
    """
    x = prepare_input(text)
    preds = model.predict(x, verbose=0)[0]
    next_indices = sample(preds, n)
    # Slide the context window by one *word*: drop the oldest word and append
    # the candidate. Slicing the string with text[1:] would only drop a single
    # character and glue the candidate onto the previous word.
    context_tail = text.split()[1:]
    completions = []
    for idx in next_indices:
        candidate = indices_words[idx]
        context = " ".join(context_tail + [candidate])
        completions.append(" ".join([candidate, predict_completion(context)]))
    return completions

# Print the words the model generates after the one-word prompt "hello".
print(predict_completion("hello"))

Leave a Comment