Complete Code Walkthrough: English-to-German Translation

Data + code: a TensorFlow implementation
Tutorial video
A public parallel corpus
Screenshots of the code running:

The following code should be run in Jupyter with Python 3.

### 1. Dependencies required by the model
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
import time
import matplotlib.pyplot as plt
import pickle

### 2. Load the data (English-German pairs)
X, Y, en_word2idx, en_idx2word, en_vocab, de_word2idx, de_idx2word, de_vocab = pickle.load(open("data.pkl", 'rb'), encoding='utf-8')

### 3. Inspect the data format
print('Sentence in English - encoded:', X[0])
print('Sentence in German - encoded:', Y[0])
print('Decoded:\n------------------------')

print('English sentence:',end=' ')
for i in range(len(X[1])):
    print(en_idx2word[X[1][i]],end=' ')
    
print('\nGerman sentence:',end=' ')
for i in range(len(Y[1])):
    print(de_idx2word[Y[1][i]],end=' ')

### 4. Pad X and Y to fixed lengths, then split into training and test sets
def data_padding(x, y, length = 15):
    for i in range(len(x)):
        x[i] = x[i] + (length - len(x[i])) * [en_word2idx['<pad>']]
        y[i] = [de_word2idx['<go>']] + y[i] + [de_word2idx['<eos>']] + (length-len(y[i])) * [de_word2idx['<pad>']]

data_padding(X, Y)
X_train,  X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1)

del X
del Y
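
To make the padding concrete, here is a minimal, self-contained sketch with a toy vocabulary (the words and indices below are made up for illustration only): the English side is padded to 15 tokens, while the German side gets a <go> prefix, an <eos> suffix, and padding, which is why it ends up 17 tokens long (the output_seq_len used in step 5).

# Toy illustration of the padding scheme (hypothetical words and indices, not the real vocab)
toy_en = {'<ukn>': 0, '<pad>': 1, 'hello': 2, 'world': 3}
toy_de = {'<ukn>': 0, '<go>': 1, '<eos>': 2, '<pad>': 3, 'hallo': 4, 'welt': 5}

toy_x = [2, 3]   # "hello world"
toy_y = [4, 5]   # "hallo welt"

toy_x = toy_x + (15 - len(toy_x)) * [toy_en['<pad>']]
toy_y = [toy_de['<go>']] + toy_y + [toy_de['<eos>']] + (15 - len(toy_y)) * [toy_de['<pad>']]

print(len(toy_x))   # 15 -> input_seq_len
print(len(toy_y))   # 17 -> output_seq_len (<go> + 15 tokens + <eos>)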

### 5. Build the translation model
input_seq_len = 15
output_seq_len = 17
en_vocab_size = len(en_vocab) + 2 # + <pad>, <ukn>
de_vocab_size = len(de_vocab) + 4 # + <pad>, <ukn>, <eos>, <go>

# Placeholders: len(encoder_inputs)=15, len(decoder_inputs)=17, len(targets)=17, len(target_weights)=17
encoder_inputs = [tf.placeholder(dtype = tf.int32, shape = [None], name = 'encoder{}'.format(i)) for i in range(input_seq_len)]
decoder_inputs = [tf.placeholder(dtype = tf.int32, shape = [None], name = 'decoder{}'.format(i)) for i in range(output_seq_len)]
targets = [decoder_inputs[i+1] for i in range(output_seq_len-1)]
targets.append(tf.placeholder(dtype = tf.int32, shape = [None], name = 'last_target'))
target_weights = [tf.placeholder(dtype = tf.float32, shape = [None], name = 'target_w{}'.format(i)) for i in range(output_seq_len)]
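
# Illustration (not part of the model): targets are simply the decoder inputs
# shifted left by one time step, so at step t the model learns to predict the
# decoder token of step t+1. For a hypothetical decoder sequence:
#   decoder inputs:  <go>   hallo  welt   <eos>  <pad> ...
#   targets:         hallo  welt   <eos>  <pad>  ...
# (the final target slot is a separate placeholder, fed with <pad> in feed_dict, step 7)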

# Output projection
size = 512  # LSTM hidden size (dimension of each decoder output)
w_t = tf.get_variable('proj_w', [de_vocab_size, size], tf.float32)  # output projection weights
b = tf.get_variable('proj_b', [de_vocab_size], tf.float32)  # output projection bias
w = tf.transpose(w_t)
output_projection = (w, b)
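
# Shape note: w_t is [de_vocab_size, size] and w = tf.transpose(w_t) is
# [size, de_vocab_size], so projecting a decoder output of shape [batch, size]
# via tf.matmul(output, w) + b yields per-word logits of shape
# [batch, de_vocab_size] (done explicitly as outputs_proj in step 8).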

# Call TensorFlow's built-in embedding_attention_seq2seq directly
outputs, states = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
                                            encoder_inputs, 
                                            decoder_inputs, 
                                            tf.contrib.rnn.BasicLSTMCell(size),
                                            num_encoder_symbols = en_vocab_size,
                                            num_decoder_symbols = de_vocab_size,
                                            embedding_size = 100,
                                            feed_previous = False,
                                            output_projection = output_projection,
                                            dtype = tf.float32)

### 6. Define the loss function
# sampled softmax loss - returns: A batch_size 1-D tensor of per-example sampled softmax losses
def sampled_loss(labels, logits):
    return tf.nn.sampled_softmax_loss(
                        weights = w_t,
                        biases = b,
                        labels = tf.reshape(labels, [-1, 1]),
                        inputs = logits,
                        num_sampled = 512,
                        num_classes = de_vocab_size)

# Weighted cross-entropy loss between the predicted and target sequences
loss = tf.contrib.legacy_seq2seq.sequence_loss(outputs, targets, target_weights, softmax_loss_function = sampled_loss)
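
For intuition, the quantity that sampled_softmax_loss approximates is the full softmax cross-entropy over the entire German vocabulary. The NumPy sketch below (toy sizes and made-up variable names, not part of the original notebook) shows what that full loss would look like for a single decoder time step; sampled_softmax_loss avoids the expensive full-vocabulary projection by scoring only the true class plus num_sampled = 512 randomly drawn negative classes during training.

# Toy NumPy sketch of the full softmax cross-entropy for ONE decoder time step
# (illustration only; the model above uses tf.nn.sampled_softmax_loss instead)
toy_vocab_size = 6                               # stand-in for de_vocab_size
decoder_out = np.random.randn(size)              # one decoder output vector, shape [512]
proj_w = np.random.randn(toy_vocab_size, size)   # plays the role of w_t
proj_b = np.zeros(toy_vocab_size)                # plays the role of b

full_logits = proj_w.dot(decoder_out) + proj_b   # scores for every word in the toy vocab
full_probs = np.exp(full_logits - full_logits.max())
full_probs /= full_probs.sum()

true_word_idx = 4                                # hypothetical correct next word
print('full softmax loss:', -np.log(full_probs[true_word_idx]))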

### 7. Helper functions
# Numerically stable softmax (used when decoding model outputs)
def softmax(x):
    n = np.max(x)
    e_x = np.exp(x - n)
    return e_x / e_x.sum()
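
# Quick check (illustration only): subtracting the max keeps np.exp from
# overflowing on large scores; softmax([1000, 1001, 1002]) == softmax([0, 1, 2]).
print(softmax(np.array([1000.0, 1001.0, 1002.0])))   # ~[0.09, 0.24, 0.67]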

# Build the feed dict for a randomly sampled mini-batch
def feed_dict(x, y, batch_size = 64):
    feed = {}
    idxes = np.random.choice(len(x), size = batch_size, replace = False)
    
    for i in range(input_seq_len):
        feed[encoder_inputs[i].name] = np.array([x[j][i] for j in idxes], dtype = np.int32)
        
    for i in range(output_seq_len):
        feed[decoder_inputs[i].name] = np.array([y[j][i] for j in idxes], dtype = np.int32)
        
    feed[targets[len(targets)-1].name] = np.full(shape = [batch_size], fill_value = de_word2idx['<pad>'], dtype = np.int32)
    
    for i in range(output_seq_len-1):
        batch_weights = np.ones(batch_size, dtype = np.float32)
        target = feed[decoder_inputs[i+1].name]
        for j in range(batch_size):
            if target[j] == de_word2idx['<pad>']:
                batch_weights[j] = 0.0
        feed[target_weights[i].name] = batch_weights
        
    feed[target_weights[output_seq_len-1].name] = np.zeros(batch_size, dtype = np.float32)
    return feed
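
# Optional sanity check (illustration only): every placeholder receives one value
# per example in the mini-batch, and the feed holds
# 15 encoder inputs + 17 decoder inputs + 1 extra target + 17 target weights = 50 entries.
_check_feed = feed_dict(X_train, Y_train, batch_size = 8)
assert all(v.shape == (8,) for v in _check_feed.values())
print('entries in feed:', len(_check_feed))   # 50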

# Decode a sequence of output logits into German words (greedy argmax at each step)
def decode_output(output_seq):
    words = []
    for i in range(output_seq_len):
        smax = softmax(output_seq[i])
        idx = np.argmax(smax)
        words.append(de_idx2word[idx])
    return words

# ops and hyperparameters
learning_rate = 5e-3
batch_size = 64
steps = 10  # Note: the original value was 1000; 10 is used here just to verify quickly that the model runs

### 8. Training and inference ops
# ops for projecting outputs
outputs_proj = [tf.matmul(outputs[i], output_projection[0]) + output_projection[1] for i in range(output_seq_len)]
# training op
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(loss)
# init op
init = tf.global_variables_initializer()
# forward step
def forward_step(sess, feed):
    output_sequences = sess.run(outputs_proj, feed_dict = feed)
    return output_sequences
# training step
def backward_step(sess, feed):
    sess.run(optimizer, feed_dict = feed)

### 9. Train the model and save checkpoints
losses = []
saver = tf.train.Saver()  # for saving model checkpoints
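# note: the checkpoints/ directory should exist before training starts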
print('------------------TRAINING------------------')

with tf.Session() as sess:
    sess.run(init)
    t = time.time()
    for step in range(steps):
        feed = feed_dict(X_train, Y_train)
        backward_step(sess, feed)
        
        if step % 5 == 4 or step == 0:
            loss_value = sess.run(loss, feed_dict = feed)
            print('step: {}, loss: {}'.format(step, loss_value))
            losses.append(loss_value)
        
        if step % 20 == 19:
            saver.save(sess, 'checkpoints/', global_step=step)
            print('Checkpoint is saved')

    print('Training time for {} steps: {}s'.format(steps, time.time() - t))

### 10. Plot the training loss
with plt.style.context('fivethirtyeight'):
    plt.plot(losses, linewidth = 1)
    plt.xlabel('Steps')
    plt.ylabel('Losses')
    plt.ylim((0, 12))
plt.show()

### 11. Test the trained model
with tf.Graph().as_default():
    # placeholders
    encoder_inputs = [tf.placeholder(dtype = tf.int32, shape = [None], name = 'encoder{}'.format(i)) for i in range(input_seq_len)]
    decoder_inputs = [tf.placeholder(dtype = tf.int32, shape = [None], name = 'decoder{}'.format(i)) for i in range(output_seq_len)]
    # output projection
    size = 512
    w_t = tf.get_variable('proj_w', [de_vocab_size, size], tf.float32)
    b = tf.get_variable('proj_b', [de_vocab_size], tf.float32)
    w = tf.transpose(w_t)
    output_projection = (w, b)
    # change the model so that output at time t can be fed as input at time t+1
    outputs, states = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
                                                encoder_inputs,
                                                decoder_inputs,
                                                tf.contrib.rnn.BasicLSTMCell(size),
                                                num_encoder_symbols = en_vocab_size,
                                                num_decoder_symbols = de_vocab_size,
                                                embedding_size = 100,
                                                feed_previous = True, # <-----this is changed----->
                                                output_projection = output_projection,
                                                dtype = tf.float32)
    # ops for projecting outputs
    outputs_proj = [tf.matmul(outputs[i], output_projection[0]) + output_projection[1] for i in range(output_seq_len)]

    # let's translate these sentences     
    en_sentences = ["What' s your name", 'My name is', 'What are you doing', 'I am reading a book',\
                    'How are you', 'I am good', 'Do you speak English', 'What time is it', 'Hi', 'Goodbye', 'Yes', 'No']
    en_sentences_encoded = [[en_word2idx.get(word, 0) for word in en_sentence.split()] for en_sentence in en_sentences]
    
    # padding to fit encoder input
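    # note: the hard-coded 15 below must match input_seq_len used to build the graph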
    for i in range(len(en_sentences_encoded)):
        en_sentences_encoded[i] += (15 - len(en_sentences_encoded[i])) * [en_word2idx['<pad>']]
    
    # restore all variables - use the last checkpoint saved
    saver = tf.train.Saver()
    path = tf.train.latest_checkpoint('checkpoints')
    
    with tf.Session() as sess:
        # restore
        saver.restore(sess, path)
        # feed data into placeholders
        feed = {}
        for i in range(input_seq_len):
            feed[encoder_inputs[i].name] = np.array([en_sentences_encoded[j][i] for j in range(len(en_sentences_encoded))], dtype = np.int32)
            
        feed[decoder_inputs[0].name] = np.array([de_word2idx['<go>']] * len(en_sentences_encoded), dtype = np.int32)
        
        # translate
        output_sequences = sess.run(outputs_proj, feed_dict = feed)
        # decode seq.
        for i in range(len(en_sentences_encoded)):
            print('{}.\n--------------------------------'.format(i+1))
            output_seq = [output_sequences[j][i] for j in range(output_seq_len)]
            # decode output sequence
            words = decode_output(output_seq)

            print(en_sentences[i])
            for word in words:
                if word not in ['<eos>', '<pad>', '<go>']:
                    print(word, end=' ')
            
            print('\n--------------------------------')

Code for the data preprocessing step:

Format of data.en (one sentence per line):

I was a Ph.D. student in clinical psychology at Berkeley. 
She was a 26-year-old woman named Alex. 
Now Alex walked into her first session wearing jeans and a big slouchy top, and she dropped onto the couch in my office and kicked off her flats and told me she was there to talk about guy problems. 
Now when I heard this, I was so relieved. 
My classmate got an arsonist for her first client. 
And I got a twentysomething who wanted to talk about boys. 
This I thought I could handle. 
But I didn't handle it. 
With the funny stories that Alex would bring to session, it was easy for me just to nod my head while we kicked the can down the road.

Format of data.de (one sentence per line):

Als ich in meinen 20ern war, hatte ich meine erste Psychotherapie-Patientin. 
Ich war Doktorandin und studierte Klinische Psychologie in Berkeley. 
Sie war eine 26-jährige Frau namens Alex. 
Als Alex in die erste Sitzung kam, trug sie Jeans und ein ausgebeultes Top. Sie fiel auf das Sofa in meinem Büro, schleuderte ihre Sandalen von sich und erzählte mir, sie wäre da, um über Männerprobleme zu reden. 
Und als ich das hörte, war ich erleichtert. 
Meine Kommilitonin bekam nämlich einen Brandstifter als ersten Patienten. 
Und ich bekam eine Frau in den 20ern, die über Jungs reden wollte. 
Das kriege ich hin, dachte ich mir. 
Aber ich habe es nicht hingekriegt. 
Mit den lustigen Geschichten, die Alex mit in die Sitzung brachte, war es leicht für mich, einfach mit dem Kopf zu nicken, während wir die Probleme vor uns herschoben. 

The following code preprocesses the parallel corpus shown above; Python 3, run in Jupyter.

import pickle
from collections import Counter
from operator import itemgetter

def read_sentences(file_path):
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as reader:
        for s in reader:
            sentences.append(s.strip())

    return sentences

def create_dataset(en_sentences, de_sentences):

    en_vocab_dict = Counter(word.strip(',." ;:)(][?!') for sentence in en_sentences for word in sentence.split())
    de_vocab_dict = Counter(word.strip(',." ;:)(][?!') for sentence in de_sentences for word in sentence.split())

    en_vocab = list(map(lambda x: x[0], sorted(en_vocab_dict.items(), key = lambda x: -x[1])))
    de_vocab = list(map(lambda x: x[0], sorted(de_vocab_dict.items(), key = lambda x: -x[1])))


#   en_vocab = en_vocab[:20000]
#   de_vocab = de_vocab[:30000]

    start_idx = 2
    en_word2idx = dict([(word, idx+start_idx) for idx, word in enumerate(en_vocab)])
    en_word2idx['<ukn>'] = 0
    en_word2idx['<pad>'] = 1

    en_idx2word = dict([(idx, word) for word, idx in en_word2idx.items()])

    start_idx = 4
    de_word2idx = dict([(word, idx+start_idx) for idx, word in enumerate(de_vocab)])
    de_word2idx['<ukn>'] = 0
    de_word2idx['<go>']  = 1
    de_word2idx['<eos>'] = 2
    de_word2idx['<pad>'] = 3

    de_idx2word = dict([(idx, word) for word, idx in de_word2idx.items()])

    x = [[en_word2idx.get(word.strip(',." ;:)(][?!'), 0) for word in sentence.split()] for sentence in en_sentences]
    y = [[de_word2idx.get(word.strip(',." ;:)(][?!'), 0) for word in sentence.split()] for sentence in de_sentences]

    X = []
    Y = []
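    # Keep a pair only when both sides have at most 15 tokens and their lengths
    # differ by no more than 30% of the shorter one; e.g. lengths (10, 12) pass
    # (|10 - 12| = 2 <= 0.3 * 10 = 3), while lengths (5, 9) are dropped (4 > 1.5).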
    for i in range(len(x)):
        n1 = len(x[i])
        n2 = len(y[i])
        n = n1 if n1 < n2 else n2 
        if abs(n1 - n2) <= 0.3 * n:
            if n1 <= 15 and n2 <= 15:
                X.append(x[i])
                Y.append(y[i])

    return X, Y, en_word2idx, en_idx2word, en_vocab, de_word2idx, de_idx2word, de_vocab

def save_dataset(file_path, obj):
    with open(file_path, 'wb') as f:
        pickle.dump(obj, f, -1)

def read_dataset(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

en_sentences = read_sentences('data.en')
de_sentences = read_sentences('data.de')
save_dataset('demo_data.pkl', create_dataset(en_sentences, de_sentences))
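
One thing to watch: the pickle is written as demo_data.pkl here, while step 2 of the training notebook loads data.pkl, so rename the file (or adjust one of the paths) before training. A quick, optional round-trip check on the file just saved:

# Optional round-trip check: the saved tuple must unpack exactly as in step 2 of the notebook
X, Y, en_word2idx, en_idx2word, en_vocab, de_word2idx, de_idx2word, de_vocab = read_dataset('demo_data.pkl')
print(len(X), len(Y))                        # number of sentence pairs kept by the length filter
print([en_idx2word[idx] for idx in X[0]])    # first English sentence, decoded back to words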