Creating An AI-Based JFK Speech Writer: Part 2¶
Introduction ¶
In this blog post I follow up on the last post and develop a model for text generation using Recurrent Neural Networks. I'll build a bi-directional gated recurrent unit (GRU) that is trained on speeches made by President John F. Kennedy. Specifically, I'll go over how to build a model that predicts the "next word" in a sentence based on the sequence of words coming before it. This project was more challenging than I initially anticipated due to the data preparation needs of the problem as well as the fact that performance is hard to quantify. The data preparation was more involved than in other posts I have done on natural language processing since it involves modeling a sequence of words instead of using a "bag-of-words." I'll go over some of these details in the post.
The concept of sequence modeling with recurrent neural networks is also different from other models I have covered in the past, and I will spend some time on this topic. Interestingly, next word prediction turns out to be a multi-class classification problem, albeit with a very large number of classes! Let's get started with the problem.
The first step is to import the necessary TensorFlow and Google Cloud Python packages (since the data is in Google Cloud Storage):
import numpy as np
import tensorflow as tf
from google.oauth2 import service_account
from google.cloud import storage
tf.compat.v1.logging.set_verbosity('ERROR')
tf.config.list_physical_devices('GPU')
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
print(tf.__version__)
2.9.0
Data Preparation ¶
Next I connect to Google Cloud Storage to download all the concatenated speeches by President Kennedy. To do this I get my GCP credentials and then instantiate the client to connect to the bucket gs://harmon-kennedy/.
credentials = service_account.Credentials.from_service_account_file('credentials.json')
client = storage.Client(project=credentials.project_id, credentials=credentials)
bucket = client.get_bucket("harmon-kennedy")
Now I can download all the speeches that were concatenated into one file,
blob = bucket.blob("all_jfk_speeches.txt")
text = blob.download_as_text()
I can see the first 300 characters of the text are,
text[:300]
'Of particular importance to South Dakota are the farm policies of the Republican party - the party of Benson, Nixon and Mundt - the party which offers our young people no incentive to return to the farm - which offers the farmer only the prospect of lower and lower income - and which offers the nati'
To get situated with the data I can get the number of characters in the text as well as the number of unique characters,
print(f'Length of text: {len(text)} characters')
Length of text: 7734579 characters
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')
67 unique characters
Since I'll be making a word-level model this isn't totally helpful. Instead I'll get the total number of words and the number of unique words. To do this I need to clean the text: convert newline characters to spaces, drop tokens that aren't purely alphabetic, and convert everything to lower case.
words = text.replace("\n", " ").split(" ")
clean_words = [word.lower() for word in words if word.isalpha()]
clean_text = " ".join(clean_words)
The impact this had on the same text from above can be seen below,
clean_text[:300]
'of particular importance to south dakota are the farm policies of the republican party the party of nixon and mundt the party which offers our young people no incentive to return to the farm which offers the farmer only the prospect of lower and lower income and which offers the nation the vision of'
The total number of clean words and unique clean words in the text are,
print(f"{len(clean_words)} number of clean words")
1196835 number of clean words
print(f"{len(set(clean_words))} unique clean words")
19291 unique clean words
Now let's talk about how we can process our text data for training a model to predict the next word.
The way a word-level text generation model is built is to take a sequence of N words and then predict the next one. To create a training set, the text is split up into sliding windows where the feature vector x is the N words in the sequence and the target y is the (N+1)-th word in that text. This process is repeated by sliding the window one word at a time through the text.
For instance take the sentence "the man is walking down the street." To build a model that predicts the next word based on the 4 words that come before it, it is necessary to create the 4 training examples as shown below,
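Here is a quick sketch of that windowing in code (treating the final period as its own token, so the sentence yields the 4 examples mentioned above):

tokens = ["the", "man", "is", "walking", "down", "the", "street", "."]
n = 4  # number of preceding words used to predict the next one

# Slide a window of n words across the sentence; the word after the window is the target.
examples = [(tokens[i:i + n], tokens[i + n]) for i in range(len(tokens) - n)]
for feature, target in examples:
    print(" ".join(feature), "->", target)
# the man is walking -> down
# man is walking down -> the
# is walking down the -> street
# walking down the street -> .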
For this model I will use seq_length as N, the number of words in the text used to predict the next word. In order to be able to predict the next word I also need to reduce the total number of words that are possible to predict to a finite number. This means limiting the set of possible words to a size of vocab_size, which in turn converts the next word prediction problem into a classification problem with vocab_size classes.
In order to convert the text, which is represented as a sequence of words, into numerical vectors I'll use the TextVectorization class. This technique is discussed in more detail in a prior post which you can read here.
vocab_size = 12000
seq_length = 30
I first instantiate the TextVectorization
layer and fit it to the text:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
vectorizer_layer = TextVectorization(
standardize="lower_and_strip_punctuation",
max_tokens=vocab_size,
output_mode="int",
output_sequence_length=seq_length,
)
vectorizer_layer.adapt([clean_text])
2024-01-28 17:07:18.274711: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
Note that I do this on the clean_text string and not the text string.
I can then get the set of words in the vectorizer_layer's "vocabulary" and create a dictionary to look up each word's equivalent numerical value.
voc = vectorizer_layer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))
We can see the vocab size of the vectorizer_layer,
len(voc)
12000
The numerical value for each of the first two words in the example text above is then,
word_index['of']
3
word_index['particular']
717
The numerical value for the "out of vocabulary" token is,
word_index['[UNK]']
1
Next I'll create the dataset X and y, where X is the vector of features, which in turn are the numerical values for the sequence of words. The vector y is the target, which consists of integers representing the numerical value of the next word for the corresponding sequence in X:
words_seq = [clean_words[i:i + seq_length] for i in range(0, len(clean_words) - seq_length-1)]
next_word = [clean_words[i + seq_length] for i in range(0, len(clean_words) - seq_length-1)]
Each entry in words_seq is a list of the seq_length words or tokens that make up the sequence in that training example.
for words in words_seq[:2]:
print(" ".join(words) + "\n")
of particular importance to south dakota are the farm policies of the republican party the party of nixon and mundt the party which offers our young people no incentive to particular importance to south dakota are the farm policies of the republican party the party of nixon and mundt the party which offers our young people no incentive to return
Now I'll convert the target vector of "next words" to a vector of "numerical values" using the word_index dictionary:
next_cat = np.array([word_index.get(word, 1) for word in next_word])
next_cat[:2]
array([978, 5])
Notice that if the word is not in the word_index then it is given the out-of-vocabulary integer of 1.
Then I convert the list of lists of words into a list of strings,
X = np.array([" ".join(words_seq[i]) for i in range(len(next_word))
if next_cat[i] != 1]).reshape(-1,1)
X[:2]
array([['of particular importance to south dakota are the farm policies of the republican party the party of nixon and mundt the party which offers our young people no incentive to'], ['particular importance to south dakota are the farm policies of the republican party the party of nixon and mundt the party which offers our young people no incentive to return']], dtype='<U260')
The reason for doing this is so that my model will be able to take plain text as input instead of needing lists of strings that represent that text. The latter would require new inputs to be pre-processed before being fed into the trained model, while the former means the trained model can take raw text directly as input.
Notice that I only included sequences of the text where the target word was not an out of vocabulary word.
The next two words that correspond to the targets for the examples above are,
next_word[:2]
['return', 'to']
Lastly, I'll create the target vector by filtering out the cases where the target would be an out-of-vocabulary token:
y = np.array([cat for cat in next_cat if cat != 1])
y[:2]
array([978, 5])
The reason for filtering the out-of-vocabulary tokens is that I don't want to train a model that predicts out-of-vocabulary words, since this would be meaningless to end users.
The size of the X dataset is,
X.shape
(1187726, 1)
That is, X is technically a 1-D array, but each entry in X is an array that contains the string of text. Once we transform the X array with the vectorizer layer we will have a matrix of size,
vectorizer_layer.call(X).shape
TensorShape([1187726, 30])
This is what we would expect: 30 features per entry in our design matrix, one for each word in the sequence. Again, I use this setup where X is a 1-dimensional array so that my model's only input is text.
The target variable has shape,
y.shape
(1187726,)
Now to see what effect the vectorizer layer has on the text I'll feed the first two sequences above through the layer.
vectorizer_layer.call(X[:2])
<tf.Tensor: shape=(2, 30), dtype=int64, numpy= array([[ 3, 717, 652, 5, 482, 2772, 16, 2, 143, 280, 3, 2, 142, 81, 2, 81, 3, 192, 4, 8230, 2, 81, 23, 1290, 13, 406, 57, 46, 3001, 5], [ 717, 652, 5, 482, 2772, 16, 2, 143, 280, 3, 2, 142, 81, 2, 81, 3, 192, 4, 8230, 2, 81, 23, 1290, 13, 406, 57, 46, 3001, 5, 978]])>
The vectorizer layer converts the array of strings with shape (1187726,) to a matrix of integers of shape (1187726, seq_length). Each entry in the matrix is an integer between 0 and vocab_size - 1 and is the integer representation of a word (0 is reserved for padding and 1 for out-of-vocabulary tokens).
Now that we have an understanding of how to create the dataset, let's talk about Recurrent Neural Networks.
A Bidirectional GRU Model ¶
Recurrent Neural Networks (RNNs) are deep learning models used to model sequences. These models use an internal state, h, to act as memory that processes these sequences and "remembers" things from the past. A quintessential diagram of an RNN is shown below,
An RNN cell is shown on the left and on the right is the "un-rolled" version that shows how the cell processes a sequence of inputs x into outputs o; the subscript t denotes the position in the sequence. The subscript on each h denotes the value of the internal state, or memory cell, at the t-th position in the sequence.
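As a rough sketch of that recurrence (not tied to any particular library, and with illustrative weight names), a vanilla RNN cell applies the same weights at every position of the sequence:

import numpy as np

def rnn_step(x_t, h_prev, W_xh, W_hh, b_h):
    # The new hidden state mixes the current input with the previous hidden state.
    return np.tanh(W_xh @ x_t + W_hh @ h_prev + b_h)

def run_rnn(xs, h0, W_xh, W_hh, b_h):
    # "Un-rolling": the same cell (same weights) is applied at every position t.
    h, states = h0, []
    for x_t in xs:
        h = rnn_step(x_t, h, W_xh, W_hh, b_h)
        states.append(h)
    return states  # one hidden state per position; an output layer maps each h_t to o_t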
There are a number of RNN architectures and a few are shown below,
The model I am building in this post, which uses a sequence of words to predict the next word, is a "many-to-one" model. The many-to-one RNN gets its name since we use a sequence ("many") of words to predict one word, i.e. the next word. Zooming into the RNN cell, we focus on a specific type of RNN called a Gated Recurrent Unit (GRU). The details of a GRU cell are shown below.
There is a hidden state h that takes on values for each iteration t. There is a candidate update to the hidden state, h with a ~ over it. The candidate update has values between -1 and +1 and is a function of the relevance gate r as well as the prior value of the hidden state and the current value of the input. The relevance gate is a value between 0 and 1 and is a function of the prior value of the hidden state and the current value of the input. It controls the amount of effect that the prior hidden state value has on the candidate update to the hidden state.
Lastly, there is a forget gate z, which is between 0 and 1 and is a function of the prior value of the hidden state and the current value of the input. The forget gate is used to control whether we update the hidden state value or not. If z = 1 then we update the hidden state to be the candidate state. If z = 0, the value of the hidden state remains unchanged.
Notice that the hidden state value h from one iteration is fed into the RNN cell at the next iteration along with the input x. These variables are not necessarily scalars and are often vectors.
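To make the gate equations concrete, here is a minimal NumPy sketch of a single GRU step following the description above; the weight matrices (Wz, Uz, Wr, Ur, Wh, Uh) and biases are illustrative placeholders, and this is not Keras' exact implementation:

import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

def gru_step(x_t, h_prev, Wz, Uz, bz, Wr, Ur, br, Wh, Uh, bh):
    z = sigmoid(Wz @ x_t + Uz @ h_prev + bz)              # forget/update gate z, between 0 and 1
    r = sigmoid(Wr @ x_t + Ur @ h_prev + br)              # relevance (reset) gate r, between 0 and 1
    h_tilde = np.tanh(Wh @ x_t + Uh @ (r * h_prev) + bh)  # candidate hidden state, between -1 and +1
    return z * h_tilde + (1.0 - z) * h_prev               # z = 1 takes the candidate, z = 0 keeps the old state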
In the model I am building, the input to the GRU cell at each step is an embedded word vector, and the output of the model is a vector of size vocab_size. To convert the final hidden state h into the output y, a dense layer with a softmax function is applied.
Natural language processing models often make use of a bi-directional RNN. In this type of model two RNN cells are used, one processing the sequence in the forward direction and one processing it in the reverse direction. The architecture is shown below:
Notice that the forward and backward GRU cells are both a function of the same input value x (both at the same time t), but are functions of hidden states h from different iterations (different values of t). Both cells at the same iteration are used to compute the output at that iteration. Bidirectional RNNs were introduced to increase the amount of input information available to the network.
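As a quick illustration of the bidirectional wrapper (using the same layer sizes as the model built later in this post), one GRU reads the sequence forward and a second reads it backward, and their final hidden states are concatenated, so 64 units per direction gives a 128-dimensional output:

import tensorflow as tf

# A batch of one sequence of 30 embedded words, each of dimension 128.
seq = tf.random.normal((1, 30, 128))

bi_gru = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64))
print(bi_gru(seq).shape)  # (1, 128)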
I had originally created a bi-directional GRU model using TensorFlow's subclassing; however, I ran into issues with the size of this dataset and shuffling the data. It was hard to shuffle the entire dataset in memory and then train on it, so instead I turned to the TensorFlow Dataset module. This allows me to "stream" over the dataset, shuffle it and mini-batch it, using from_tensor_slices to convert the (X, y) NumPy arrays into a TensorFlow dataset.
dataset = (tf.data.Dataset
.from_tensor_slices((X, y))
.shuffle(50000)
.batch(128))
Next, I built a function which creates a sequential model containing a TextVectorization layer, followed by an Embedding layer, a Bidirectional GRU layer, and finally a dense layer with a softmax activation.
I compile the model using the SparseCategoricalCrossentropy loss function, since the target variable has not been one-hot-encoded, and use the Adam optimization algorithm with a learning rate that has exponential decay. Since we are using this as a model to predict the next word, the correct answer is somewhat subjective and I don't care too much about which metric we use to measure performance.
I wrote this as a function that returns both the fitted vectorizer layer and the compiled Keras model. I need the vectorizer layer so that I have a mapping that converts the predicted numerical "next word" back into actual text.
from typing import Tuple
def build_model(
text: str,
seq_length: int,
vocab_size: int,
embedding_dim: int,
units: int
) -> Tuple[TextVectorization, tf.keras.models.Sequential]:
vectorizer_layer = TextVectorization(
standardize="lower_and_strip_punctuation",
max_tokens=vocab_size,
output_mode="int",
output_sequence_length=seq_length)
vectorizer_layer.adapt([text])
model = tf.keras.models.Sequential([
tf.keras.Input(shape=(1,),
dtype=tf.string,
name='text'),
vectorizer_layer,
tf.keras.layers.Embedding(vocab_size, embedding_dim),
tf.keras.layers.Bidirectional(tf.keras.layers.GRU(units)),
tf.keras.layers.Dense(vocab_size, activation='softmax')
])
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
initial_learning_rate=1e-1,
decay_steps=1000,
decay_rate=0.001)
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
optimizer=tf.optimizers.Adam(learning_rate=lr_schedule))
return vectorizer_layer, model
I can create a model that takes in a relatively short text of 30 words and predicts the next word out of 12,000 possibilities. The embedding layer has 128 dimensions and each GRU in the Bidirectional layer has 64 units:
vectorizer, model = build_model(text=text,
seq_length=seq_length,
vocab_size=vocab_size,
embedding_dim=128,
units=64)
Notice that for the vectorizer layer I have to pass the original text, seq_length and vocab_size values to initialize that layer properly. I can get the summary of the model:
model.summary()
Model: "sequential_19"
_________________________________________________________________
 Layer (type)                     Output Shape         Param #
=================================================================
 text_vectorization_30            (None, 30)           0
 (TextVectorization)
 embedding_19 (Embedding)         (None, 30, 128)      1536000
 bidirectional_19 (Bidirectional) (None, 128)          74496
 dense_19 (Dense)                 (None, 12000)        1548000
=================================================================
Total params: 3,158,496
Trainable params: 3,158,496
Non-trainable params: 0
_________________________________________________________________
The model has over 3 million parameters which is a lot!
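As a sanity check, those parameter counts can be reproduced by hand; the GRU term below assumes Keras' default reset_after=True parameterization (which adds two bias terms per gate):

embedding = 12000 * 128                   # vocab_size * embedding_dim               = 1,536,000
gru_one_dir = 3 * 64 * (128 + 64 + 2)     # 3 gates * units * (input + units + 2 bias terms)
bidirectional = 2 * gru_one_dir           # forward + backward GRU                   = 74,496
dense = (2 * 64) * 12000 + 12000          # concatenated state * vocab_size + biases = 1,548,000
print(embedding + bidirectional + dense)  # 3,158,496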
Now I can train the model on the dataset with a modest 3 epochs:
history = model.fit(dataset, epochs=3)
Epoch 1/3
9280/9280 [==============================] - 474s 51ms/step - loss: 7.5869
Epoch 2/3
9280/9280 [==============================] - 465s 50ms/step - loss: 7.3280
Epoch 3/3
9280/9280 [==============================] - 477s 51ms/step - loss: 7.3280
Now I can save the model for future use:
model.save("jfk_model")
WARNING:absl:Found untraced functions such as gru_cell_61_layer_call_fn, gru_cell_61_layer_call_and_return_conditional_losses, gru_cell_62_layer_call_fn, gru_cell_62_layer_call_and_return_conditional_losses while saving (showing 4 of 4). These functions will not be directly callable after loading.
I can reload the model (at another time) as shown below,
model = tf.keras.models.load_model("jfk_model")
I can predict the next word in the sequence from one of the training examples,
X[0]
array(['of particular importance to south dakota are the farm policies of the republican party the party of nixon and mundt the party which offers our young people no incentive to'], dtype='<U260')
np.argmax(model.predict([X[0]]))
1/1 [==============================] - 0s 474ms/step
2
We can then create the mapping to look up the text associated with the numerical value of the next word from the vectorizer layer.
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))
reverse_word_map = dict(map(reversed, word_index.items()))
str(reverse_word_map[
np.argmax(
model.predict(X[0])
)
])
1/1 [==============================] - 0s 101ms/step
'the'
We did not get the correct word (the true next word was "return"); this can happen.
Generating Text ¶
Now that the model can generate the next word based on the 30 preceding words we can use it to create text.
I'll use an in-sample JFK speech at first:
test = X[3342][0]
print(test)
fails to recognize that the problems of one industry may be different from it completely fails to respect the traditional practices widely accepted in the building it completely fails to
Now I can generate the next best n-words using a greedy algorithm defined below:
def next_words_greedy(input_str: str, n: int) -> str:
    final_str = ''
    for i in range(n):
        # Predict the probability distribution over the vocabulary for the next word.
        prediction = model.predict(np.array([input_str]), verbose=0)
        # Greedily pick the single most probable word.
        idx = np.argmax(prediction[0])
        next_word = str(reverse_word_map[idx])
        final_str += next_word + ' '
        # Slide the window: append the new word and drop the oldest one.
        input_str += ' ' + next_word
        input_str = ' '.join(input_str.split(' ')[1:])
    return final_str
The above function repeatedly adds the next most probable word to the sentence.
Let's see the results:
next_words_greedy(test, 3)
'invited the much '
This doesn't quite make sense. Choosing the next best word at each step can give poor results since the sentence might not make sense, and it often leads to repeated words.
There are a few ways to generate more realistic sentences, one of them being the beam search algorithm. I actually tried using this method with KerasNLP, but had a bunch of issues and could not get it to work.
So instead I wanted to look at adding some randomness to the next-best-word algorithm. One way to do this is to choose the next word by sampling from the predicted distribution of words based on their probabilities. I used the choice method from NumPy to accomplish this below,
def next_words_distribution(input_str: str, n: int) -> str:
    final_str = input_str + ' '
    for i in range(n):
        prediction = model.predict(np.array([input_str]), verbose=0)
        # Sample the next word from the predicted probability distribution
        # instead of always taking the argmax.
        idx = np.random.choice(vocab_size, p=prediction[0])
        next_word = str(reverse_word_map[idx])
        final_str += next_word + ' '
        # Slide the window forward by one word.
        input_str += ' ' + next_word
        input_str = ' '.join(input_str.split(' ')[1:])
    return final_str
Now let's test it out:
next_words_distribution(test, 10)
'fails to recognize that the problems of one industry may be different from it completely fails to respect the traditional practices widely accepted in the building it completely fails to attorney shall at to hospital that attract on a demand '
This again doesn't make much sense. Building a JFK speech writer from scratch is not as easy as I thought!
I spent a lot of time tweaking the model to no avail. Instead I think I will stop pursuing this architecture and use a more modern one in another blog post.
Next Steps ¶
In this blog post I covered how to create a generative text model using a bi-directional gated recurrent unit (GRU) trained on speeches made by President John F. Kennedy. The model was built in Keras using TensorFlow as a back-end and I covered how to use the model to generate text based on an input string.
The GRU model is a specific type of Recurrent Neural Network (RNN) and models sequences. RNNs were quite popular for Natural Language Processing until around 2017/2018. More recently, Recurrent Neural Networks have fallen out of popularity for NLP tasks as Transformer and Attention based methods have shown substantially better performance. Using transformers for generating text that is meant to sound like JFK would be a natural next step and will be a follow up for a future blog post!