.. raw:: html

.. code:: python

    from mxnet import gluon, init, np, npx
    from mxnet.gluon import nn, rnn
    from d2l import mxnet as d2l

    npx.set_np()

    # `d2l.load_data_imdb` returns the training iterator, the test iterator
    # and the vocabulary for the given batch size
    batch_size = 64
    train_iter, test_iter, vocab = d2l.load_data_imdb(batch_size)

.. raw:: html

.. raw:: html

.. code:: python

    import torch
    from torch import nn
    from d2l import torch as d2l

    # `d2l.load_data_imdb` returns the training iterator, the test iterator
    # and the vocabulary for the given batch size
    batch_size = 64
    train_iter, test_iter, vocab = d2l.load_data_imdb(batch_size)

.. parsed-literal::
    :class: output

    Downloading ../data/aclImdb_v1.tar.gz from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz...

.. raw:: html

.. raw:: html

mxnet pytorch

.. raw:: html

.. code:: python

    class BiRNN(nn.Block):
        """Bidirectional LSTM encoder followed by a 2-unit dense layer.

        Token indices are embedded, run through a multi-layer bidirectional
        LSTM, and the hidden states of the first and last time steps are
        concatenated and passed to the dense output layer.
        """

        def __init__(self, vocab_size, embed_size, num_hiddens,
                     num_layers, **kwargs):
            super().__init__(**kwargs)
            self.embedding = nn.Embedding(vocab_size, embed_size)
            # `bidirectional=True` makes the LSTM run in both directions
            self.encoder = rnn.LSTM(num_hiddens, num_layers=num_layers,
                                    bidirectional=True,
                                    input_size=embed_size)
            self.decoder = nn.Dense(2)

        def forward(self, inputs):
            # `inputs` has shape (batch size, no. of time steps). The LSTM
            # expects the temporal dimension first, so the input is
            # transposed before the embedding lookup, giving
            # (no. of time steps, batch size, word vector dimension)
            token_vectors = self.embedding(inputs.T)
            # Hidden states of the last hidden layer at every time step,
            # shape (no. of time steps, batch size, 2 * no. of hidden units)
            hidden_states = self.encoder(token_vectors)
            # Join the states at the initial and final time steps to form
            # the input of the fully-connected layer, shape
            # (batch size, 4 * no. of hidden units)
            first_step, last_step = hidden_states[0], hidden_states[-1]
            features = np.concatenate((first_step, last_step), axis=1)
            return self.decoder(features)

.. raw:: html

.. raw:: html

.. code:: python

    class BiRNN(nn.Module):
        """Bidirectional LSTM encoder followed by a linear 2-way classifier.

        Token indices are embedded, run through a multi-layer bidirectional
        LSTM, and the hidden states of the first and last time steps are
        concatenated and fed to the linear output layer.
        """

        def __init__(self, vocab_size, embed_size, num_hiddens,
                     num_layers, **kwargs):
            super().__init__(**kwargs)
            self.embedding = nn.Embedding(vocab_size, embed_size)
            # Set `bidirectional` to True to get a bidirectional RNN
            self.encoder = nn.LSTM(embed_size, num_hiddens,
                                   num_layers=num_layers, bidirectional=True)
            # The decoder input joins two time steps of
            # 2 * no. of hidden units each, hence 4 * num_hiddens
            self.decoder = nn.Linear(4 * num_hiddens, 2)

        def forward(self, inputs):
            # The shape of `inputs` is (batch size, no. of time steps).
            # Because LSTM requires its input's first dimension to be the
            # temporal dimension, the input is transposed before obtaining
            # token representations. The output shape is
            # (no. of time steps, batch size, word vector dimension)
            embeddings = self.embedding(inputs.T)
            self.encoder.flatten_parameters()
            # Returns hidden states of the last hidden layer at different
            # time steps. The shape of `outputs` is
            # (no. of time steps, batch size, 2 * no. of hidden units)
            outputs, _ = self.encoder(embeddings)
            # Concatenate the hidden states at the initial and final time
            # steps as the input of the fully-connected layer. Its shape is
            # (batch size, 4 * no. of hidden units)
            encoding = torch.cat((outputs[0], outputs[-1]), dim=1)
            # Map the concatenated encoding to the two output scores
            outs = self.decoder(encoding)
            return outs

.. raw:: html

.. raw:: html

mxnet pytorch

.. raw:: html

.. code:: python

    embed_size, num_hiddens, num_layers, devices = 100, 100, 2, d2l.try_all_gpus()
    net = BiRNN(len(vocab), embed_size, num_hiddens, num_layers)
    # Xavier initialization on every device returned by `d2l.try_all_gpus()`
    net.initialize(init.Xavier(), ctx=devices)

.. raw:: html

.. raw:: html

.. code:: python

    embed_size, num_hiddens, num_layers, devices = 100, 100, 2, d2l.try_all_gpus()
    net = BiRNN(len(vocab), embed_size, num_hiddens, num_layers)

    def init_weights(m):
        """Apply Xavier-uniform initialization to Linear and LSTM weights.

        Intended for `net.apply`, which calls it once per submodule.
        Modules of any other type are left unchanged.
        """
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
        if isinstance(m, nn.LSTM):
            # Re-initialize only parameters whose name contains "weight";
            # all other parameters of the LSTM are left untouched
            for name, param in m.named_parameters():
                if "weight" in name:
                    nn.init.xavier_uniform_(param)

    # The trailing semicolon suppresses the notebook echo of the module
    net.apply(init_weights);

.. raw:: html

.. raw:: html

mxnet pytorch

.. raw:: html

.. code:: python

    # Token embedding identified by the name 'glove.6b.100d'
    glove_embedding = d2l.TokenEmbedding('glove.6b.100d')

.. raw:: html

.. raw:: html

.. code:: python

    # Token embedding identified by the name 'glove.6b.100d'
    glove_embedding = d2l.TokenEmbedding('glove.6b.100d')

.. parsed-literal::
    :class: output

    Downloading ../data/glove.6B.100d.zip from http://d2l-data.s3-accelerate.amazonaws.com/glove.6B.100d.zip...

.. raw:: html

.. raw:: html

mxnet pytorch

.. raw:: html

.. code:: python

    # Look up one embedding vector per vocabulary token, in index order
    embeds = glove_embedding[vocab.idx_to_token]
    embeds.shape

.. parsed-literal::
    :class: output

    (49346, 100)

.. raw:: html

.. raw:: html

.. code:: python

    # Look up one embedding vector per vocabulary token, in index order
    embeds = glove_embedding[vocab.idx_to_token]
    embeds.shape

.. parsed-literal::
    :class: output

    torch.Size([49346, 100])

.. raw:: html

.. raw:: html

mxnet pytorch

.. raw:: html

.. code:: python

    # Overwrite the embedding weights with `embeds`, then set their
    # `grad_req` to 'null'
    net.embedding.weight.set_data(embeds)
    net.embedding.collect_params().setattr('grad_req', 'null')

.. raw:: html

.. raw:: html

.. code:: python

    # Overwrite the embedding weights with `embeds`, then exclude them from
    # gradient computation
    net.embedding.weight.data.copy_(embeds)
    net.embedding.weight.requires_grad = False

.. raw:: html

.. raw:: html

mxnet pytorch

.. raw:: html

.. code:: python

    lr, num_epochs = 0.01, 5
    # Adam trainer over all parameters collected from the network
    trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr})
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
    d2l.train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs, devices)

.. parsed-literal::
    :class: output

    loss 0.297, train acc 0.872, test acc 0.846
    707.2 examples/sec on [gpu(0), gpu(1)]

.. figure:: output_sentiment-analysis-rnn_6199ad_57_1.svg

.. raw:: html

.. raw:: html

.. code:: python

    lr, num_epochs = 0.01, 5
    trainer = torch.optim.Adam(net.parameters(), lr=lr)
    # `reduction="none"` keeps one loss value per example instead of
    # averaging over the batch
    loss = nn.CrossEntropyLoss(reduction="none")
    d2l.train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs, devices)

.. parsed-literal::
    :class: output

    loss 0.304, train acc 0.875, test acc 0.849
    709.9 examples/sec on [device(type='cuda', index=0), device(type='cuda', index=1)]

.. figure:: output_sentiment-analysis-rnn_6199ad_60_1.svg

.. raw:: html

.. raw:: html

mxnet pytorch

.. raw:: html

.. code:: python

    #@save
    def predict_sentiment(net, vocab, sequence):
        """Classify a whitespace-separated text as 'positive' or 'negative'."""
        # Map the tokens to vocabulary indices on the device from
        # `d2l.try_gpu()`
        token_ids = np.array(vocab[sequence.split()], ctx=d2l.try_gpu())
        # Reshape to a single-row batch before running the network
        scores = net(token_ids.reshape(1, -1))
        label = np.argmax(scores, axis=1)
        if label == 1:
            return 'positive'
        return 'negative'

.. raw:: html

.. raw:: html

.. code:: python

    #@save
    def predict_sentiment(net, vocab, sequence):
        """Classify a whitespace-separated text as 'positive' or 'negative'."""
        # Map the tokens to vocabulary indices on the device from
        # `d2l.try_gpu()`
        token_ids = torch.tensor(vocab[sequence.split()],
                                 device=d2l.try_gpu())
        # Reshape to a single-row batch before running the network
        scores = net(token_ids.reshape(1, -1))
        label = torch.argmax(scores, dim=1)
        if label == 1:
            return 'positive'
        return 'negative'

.. raw:: html

.. raw:: html

mxnet pytorch

.. raw:: html

.. code:: python

    # Query the network with a short review
    predict_sentiment(net, vocab, 'this movie is so great')

.. parsed-literal::
    :class: output

    'positive'

.. code:: python

    # Same sentence with the final adjective swapped
    predict_sentiment(net, vocab, 'this movie is so bad')

.. parsed-literal::
    :class: output

    'negative'

.. raw:: html

.. raw:: html