Upgrade tensorflow to 2.1.0 #28

Merged
merged 4 commits into from Jan 17, 2020
Changes from 3 commits
2 changes: 1 addition & 1 deletion Dockerfile
@@ -7,7 +7,7 @@ workdir /storybro

env POETRY_VIRTUALENVS_CREATE=false

run pip install poetry tensorflow==1.15
run pip install poetry tensorflow==2.0.0

run touch README.md
copy pyproject.toml pyproject.toml
6 changes: 3 additions & 3 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -27,7 +27,7 @@ pyyaml = "5.2"
regex = "2019.12.20"
func_timeout = "4.3.5"
playsound = "1.2.2"
tensorflow = "1.15"
tensorflow = "2.0.0"
tracery = "0.1.1"
cryptography = "~2.8"
click = "7.0"
8 changes: 5 additions & 3 deletions storybro/generation/gpt2/generator.py
@@ -5,6 +5,7 @@
import numpy as np

import tensorflow as tf
from tensorboard.plugins.hparams import api as hp

from storybro.generation.gpt2 import model, encoder, sample
from storybro.story.utils import cut_trailing_sentence, remove_profanity
@@ -31,14 +32,15 @@ def __init__(self, modelobj, generate_num=60, temperature=0.4, top_k=40, top_p=0
self.enc = encoder.get_encoder(self.model.root_path)
hparams = model.default_hparams()
with open(os.path.join(self.model.root_path, "hparams.json")) as f:
hparams.override_from_dict(json.load(f))
hparams = json.load(f)
seed = np.random.randint(0, 100000)

config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
self.sess = tf.compat.v1.Session(config=config)

self.context = tf.placeholder(tf.int32, [self.batch_size, None])
tf.compat.v1.disable_eager_execution()
self.context = tf.compat.v1.placeholder(tf.int32, [self.batch_size, None])
# np.random.seed(seed)
# tf.set_random_seed(seed)
self.output = sample.sample_sequence(
@@ -51,7 +53,7 @@ def __init__(self, modelobj, generate_num=60, temperature=0.4, top_k=40, top_p=0
top_p=top_p,
)

saver = tf.train.Saver()
saver = tf.compat.v1.train.Saver()
ckpt = tf.train.latest_checkpoint(self.model.root_path)
saver.restore(self.sess, ckpt)
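
For context, the generator changes above keep the TF1-style graph workflow alive under TensorFlow 2.x through the compat.v1 shims. A minimal standalone sketch of that pattern (the placeholder shape and the toy reduce_sum graph are illustrative, not the project's sampling graph):

import tensorflow as tf

# TF 2.x executes eagerly by default; the legacy Session/placeholder workflow needs this switched off.
tf.compat.v1.disable_eager_execution()

config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True  # claim GPU memory on demand instead of all at once
sess = tf.compat.v1.Session(config=config)

# Graph-mode input fed at run time, as in the pre-2.x API.
context = tf.compat.v1.placeholder(tf.int32, [1, None])
output = tf.reduce_sum(input_tensor=context)  # stand-in for the real sample_sequence output

print(sess.run(output, feed_dict={context: [[1, 2, 3]]}))  # -> 6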

102 changes: 59 additions & 43 deletions storybro/generation/gpt2/model.py
@@ -1,24 +1,28 @@
import numpy as np

import tensorflow as tf
from tensorflow.contrib.training import HParams


def default_hparams():
return HParams(n_vocab=0, n_ctx=1024, n_embd=768, n_head=12, n_layer=12,)
return {
"n_vocab": 0,
"n_ctx": 1024,
"n_embd": 768,
"n_head": 12,
"n_layer": 12,
}


def shape_list(x):
"""Deal with dynamic shape in tensorflow cleanly."""
static = x.shape.as_list()
dynamic = tf.shape(x)
dynamic = tf.shape(input=x)
return [dynamic[i] if s is None else s for i, s in enumerate(static)]


def softmax(x, axis=-1):
x = x - tf.reduce_max(x, axis=axis, keepdims=True)
x = x - tf.reduce_max(input_tensor=x, axis=axis, keepdims=True)
ex = tf.exp(x)
return ex / tf.reduce_sum(ex, axis=axis, keepdims=True)
return ex / tf.reduce_sum(input_tensor=ex, axis=axis, keepdims=True)


def gelu(x):
@@ -27,13 +31,23 @@ def gelu(x):

def norm(x, scope, *, axis=-1, epsilon=1e-5):
"""Normalize to mean = 0, std = 1, then do a diagonal affine transform."""
with tf.variable_scope(scope):
n_state = x.shape[-1].value
g = tf.get_variable("g", [n_state], initializer=tf.constant_initializer(1))
b = tf.get_variable("b", [n_state], initializer=tf.constant_initializer(0))
u = tf.reduce_mean(x, axis=axis, keepdims=True)
s = tf.reduce_mean(tf.square(x - u), axis=axis, keepdims=True)
x = (x - u) * tf.rsqrt(s + epsilon)
with tf.compat.v1.variable_scope(scope):
n_state = x.shape[-1]
g = tf.compat.v1.get_variable(
"g",
[n_state],
initializer=tf.compat.v1.constant_initializer(1),
use_resource=False,
)
b = tf.compat.v1.get_variable(
"b",
[n_state],
initializer=tf.compat.v1.constant_initializer(0),
use_resource=False,
)
u = tf.reduce_mean(input_tensor=x, axis=axis, keepdims=True)
s = tf.reduce_mean(input_tensor=tf.square(x - u), axis=axis, keepdims=True)
x = (x - u) * tf.math.rsqrt(s + epsilon)
x = x * g + b
return x

@@ -51,14 +65,14 @@ def merge_states(x):


def conv1d(x, scope, nf, *, w_init_stdev=0.02):
with tf.variable_scope(scope):
with tf.compat.v1.variable_scope(scope):
*start, nx = shape_list(x)
w = tf.get_variable(
w = tf.compat.v1.get_variable(
"w",
[1, nx, nf],
initializer=tf.random_normal_initializer(stddev=w_init_stdev),
initializer=tf.compat.v1.random_normal_initializer(stddev=w_init_stdev),
)
b = tf.get_variable("b", [nf], initializer=tf.constant_initializer(0))
b = tf.compat.v1.get_variable("b", [nf], initializer=tf.compat.v1.constant_initializer(0))
c = tf.reshape(
tf.matmul(tf.reshape(x, [-1, nx]), tf.reshape(w, [-1, nf])) + b,
start + [nf],
@@ -79,19 +93,19 @@ def attention_mask(nd, ns, *, dtype):

def attn(x, scope, n_state, *, past, hparams):
assert x.shape.ndims == 3 # Should be [batch, sequence, features]
assert n_state % hparams.n_head == 0
assert n_state % hparams["n_head"] == 0
if past is not None:
assert (
past.shape.ndims == 5
) # Should be [batch, 2, heads, sequence, features], where 2 is [k, v]

def split_heads(x):
# From [batch, sequence, features] to [batch, heads, sequence, features]
return tf.transpose(split_states(x, hparams.n_head), [0, 2, 1, 3])
return tf.transpose(a=split_states(x, hparams["n_head"]), perm=[0, 2, 1, 3])

def merge_heads(x):
# Reverse of split_heads
return merge_states(tf.transpose(x, [0, 2, 1, 3]))
return merge_states(tf.transpose(a=x, perm=[0, 2, 1, 3]))

def mask_attn_weights(w):
# w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
@@ -104,14 +118,14 @@ def mask_attn_weights(w):
def multihead_attn(q, k, v):
# q, k, v have shape [batch, heads, sequence, features]
w = tf.matmul(q, k, transpose_b=True)
w = w * tf.rsqrt(tf.cast(v.shape[-1].value, w.dtype))
w = w * tf.math.rsqrt(tf.cast(v.shape[-1], w.dtype))

w = mask_attn_weights(w)
w = softmax(w)
a = tf.matmul(w, v)
return a

with tf.variable_scope(scope):
with tf.compat.v1.variable_scope(scope):
c = conv1d(x, "c_attn", n_state * 3)
q, k, v = map(split_heads, tf.split(c, 3, axis=2))
present = tf.stack([k, v], axis=1)
@@ -126,16 +140,16 @@ def multihead_attn(q, k, v):


def mlp(x, scope, n_state, *, hparams):
with tf.variable_scope(scope):
nx = x.shape[-1].value
with tf.compat.v1.variable_scope(scope):
nx = x.shape[-1]
h = gelu(conv1d(x, "c_fc", n_state))
h2 = conv1d(h, "c_proj", nx)
return h2


def block(x, scope, *, past, hparams):
with tf.variable_scope(scope):
nx = x.shape[-1].value
with tf.compat.v1.variable_scope(scope):
nx = x.shape[-1]
a, present = attn(norm(x, "ln_1"), "attn", nx, past=past, hparams=hparams)
x = x + a
m = mlp(norm(x, "ln_2"), "mlp", nx * 4, hparams=hparams)
@@ -146,60 +160,62 @@ def block(x, scope, *, past, hparams):
def past_shape(*, hparams, batch_size=None, sequence=None):
return [
batch_size,
hparams.n_layer,
hparams["n_layer"],
2,
hparams.n_head,
hparams["n_head"],
sequence,
hparams.n_embd // hparams.n_head,
hparams["n_embd"] // hparams["n_head"],
]


def expand_tile(value, size):
"""Add a new axis of given size."""
value = tf.convert_to_tensor(value, name="value")
value = tf.convert_to_tensor(value=value, name="value")
ndims = value.shape.ndims
return tf.tile(tf.expand_dims(value, axis=0), [size] + [1] * ndims)


def positions_for(tokens, past_length):
batch_size = tf.shape(tokens)[0]
nsteps = tf.shape(tokens)[1]
batch_size = tf.shape(input=tokens)[0]
nsteps = tf.shape(input=tokens)[1]
return expand_tile(past_length + tf.range(nsteps), batch_size)


def model(hparams, X, past=None, scope="model", reuse=False):
with tf.variable_scope(scope, reuse=reuse):
with tf.compat.v1.variable_scope(scope, reuse=reuse):
results = {}
batch, sequence = shape_list(X)

wpe = tf.get_variable(
wpe = tf.compat.v1.get_variable(
"wpe",
[hparams.n_ctx, hparams.n_embd],
initializer=tf.random_normal_initializer(stddev=0.01),
[hparams["n_ctx"], hparams["n_embd"]],
initializer=tf.compat.v1.random_normal_initializer(stddev=0.01),
use_resource=False,
)
wte = tf.get_variable(
wte = tf.compat.v1.get_variable(
"wte",
[hparams.n_vocab, hparams.n_embd],
initializer=tf.random_normal_initializer(stddev=0.02),
[hparams["n_vocab"], hparams["n_embd"]],
initializer=tf.compat.v1.random_normal_initializer(stddev=0.02),
use_resource=False,
)
past_length = 0 if past is None else tf.shape(past)[-2]
past_length = 0 if past is None else tf.shape(input=past)[-2]
h = tf.gather(wte, X) + tf.gather(wpe, positions_for(X, past_length))

# Transformer
presents = []
pasts = (
tf.unstack(past, axis=1) if past is not None else [None] * hparams.n_layer
tf.unstack(past, axis=1) if past is not None else [None] * hparams["n_layer"]
)
assert len(pasts) == hparams.n_layer
assert len(pasts) == hparams["n_layer"]
for layer, past in enumerate(pasts):
h, present = block(h, "h%d" % layer, past=past, hparams=hparams)
presents.append(present)
results["present"] = tf.stack(presents, axis=1)
h = norm(h, "ln_f")

# Language model loss. Do tokens <n predict token n?
h_flat = tf.reshape(h, [batch * sequence, hparams.n_embd])
h_flat = tf.reshape(h, [batch * sequence, hparams["n_embd"]])
logits = tf.matmul(h_flat, wte, transpose_b=True)
logits = tf.reshape(logits, [batch, sequence, hparams.n_vocab])
logits = tf.reshape(logits, [batch, sequence, hparams["n_vocab"]])
results["logits"] = logits
return results
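
Because tf.contrib (and with it tf.contrib.training.HParams) no longer exists in TensorFlow 2.x, the rewritten model() takes hparams as a plain dict and indexes it with string keys. A rough sketch of how it would be driven, assuming an hparams.json with the usual GPT-2 keys; the file path is illustrative and checkpoint restoration is omitted:

import json
import tensorflow as tf
from storybro.generation.gpt2 import model

tf.compat.v1.disable_eager_execution()

with open("hparams.json") as f:  # illustrative path; the project reads it from the model's root_path
    hparams = json.load(f)       # plain dict, e.g. {"n_vocab": 50257, "n_ctx": 1024, "n_embd": 768, "n_head": 12, "n_layer": 12}

tokens = tf.compat.v1.placeholder(tf.int32, [1, None])
outputs = model.model(hparams, X=tokens)  # builds the forward graph; indexes hparams["n_head"], hparams["n_embd"], ...
logits = outputs["logits"]                # shape [batch, sequence, n_vocab]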
18 changes: 9 additions & 9 deletions storybro/generation/gpt2/sample.py
@@ -25,13 +25,13 @@ def top_k_logits(logits, k):
def _top_k():
values, _ = tf.nn.top_k(logits, k=k)
min_values = values[:, -1, tf.newaxis]
return tf.where(
return tf.compat.v1.where(
logits < min_values,
tf.ones_like(logits, dtype=logits.dtype) * -1e10,
logits,
)

return tf.cond(tf.equal(k, 0), lambda: logits, lambda: _top_k(),)
return tf.cond(pred=tf.equal(k, 0), true_fn=lambda: logits, false_fn=lambda: _top_k(),)


def top_p_logits(logits, p):
@@ -44,13 +44,13 @@ def top_p_logits(logits, p):
tf.range(0, batch),
# number of indices to include
tf.maximum(
tf.reduce_sum(tf.cast(cumulative_probs <= p, tf.int32), axis=-1) - 1, 0
tf.reduce_sum(input_tensor=tf.cast(cumulative_probs <= p, tf.int32), axis=-1) - 1, 0
),
],
axis=-1,
)
min_values = tf.gather_nd(sorted_logits, indices)
return tf.where(logits < min_values, tf.ones_like(logits) * -1e10, logits,)
return tf.compat.v1.where(logits < min_values, tf.ones_like(logits) * -1e10, logits,)


def sample_sequence(
@@ -72,26 +72,26 @@

def step(hparams, tokens, past=None):
lm_output = model.model(
hparams=hparams, X=tokens, past=past, reuse=tf.AUTO_REUSE
hparams=hparams, X=tokens, past=past, reuse=tf.compat.v1.AUTO_REUSE
)

logits = lm_output["logits"][:, :, : hparams.n_vocab]
logits = lm_output["logits"][:, :, : hparams["n_vocab"]]
presents = lm_output["present"]
presents.set_shape(model.past_shape(hparams=hparams, batch_size=batch_size))
return {
"logits": logits,
"presents": presents,
}

with tf.name_scope("sample_sequence"):
with tf.compat.v1.name_scope("sample_sequence"):

def body(past, prev, output):
next_outputs = step(hparams, prev, past=past)
logits = next_outputs["logits"][:, -1, :] / tf.to_float(temperature)
logits = next_outputs["logits"][:, -1, :] / tf.cast(temperature, dtype=tf.float32)
logits = penalize_used(logits, output)
logits = top_k_logits(logits, k=top_k)
logits = top_p_logits(logits, p=top_p)
samples = tf.multinomial(logits, num_samples=1, output_dtype=tf.int32)
samples = tf.random.categorical(logits=logits, num_samples=1, dtype=tf.int32)
return [
next_outputs["presents"]
if past is None
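
For reference, the sampling change in body() above replaces the removed tf.multinomial with tf.random.categorical, which draws token ids from unnormalized log-probabilities. A standalone sketch in eager mode (the logits values are arbitrary):

import tensorflow as tf

# Unnormalized log-probabilities for a batch of one over a 4-token vocabulary.
logits = tf.constant([[1.0, 0.5, 0.1, -2.0]])

# One draw per batch row; dtype=tf.int32 matches the token ids fed back into the sampling loop.
samples = tf.random.categorical(logits=logits, num_samples=1, dtype=tf.int32)
print(samples.numpy())  # e.g. [[0]] -- the index of the sampled token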