Complete Code Walkthrough: English-to-German Translation

Data + code: a TensorFlow implementation
Tutorial video
A public parallel corpus
Screenshots of the code running:

The following code should be run in Jupyter with Python 3.

### 1. Dependencies required by the model
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
import time
import matplotlib.pyplot as plt
import pickle

### 2. Load the data (English-German pairs)
X, Y, en_word2idx, en_idx2word, en_vocab, de_word2idx, de_idx2word, de_vocab = pickle.load(open("data.pkl", 'rb'), encoding='utf-8')

### 3. Inspect the data format
print('Sentence in English - encoded:', X[0])
print('Sentence in German - encoded:', Y[0])
print('Decoded:\n------------------------')

print('English sentence:',end=' ')
for i in range(len(X[1])):
    print(en_idx2word[X[1][i]],end=' ')
    
print('\nGerman sentence:',end=' ')
for i in range(len(Y[1])):
    print(de_idx2word[Y[1][i]],end=' ')

### 4. Pad X and Y to fixed lengths, then split into training and test sets
def data_padding(x, y, length = 15):
    for i in range(len(x)):
        x[i] = x[i] + (length - len(x[i])) * [en_word2idx['<pad>']]
        y[i] = [de_word2idx['<go>']] + y[i] + [de_word2idx['<eos>']] + (length-len(y[i])) * [de_word2idx['<pad>']]

data_padding(X, Y)
X_train,  X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1)

del X
del Y
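
To make the padding concrete, here is a minimal, self-contained sketch with a toy vocabulary (the words and indices below are made up for illustration only): the English side is padded to 15 tokens, while the German side gets a <go> prefix, an <eos> suffix, and padding, which is why it ends up 17 tokens long (the output_seq_len used in step 5).

# Toy illustration of the padding scheme (hypothetical words and indices, not the real vocab)
toy_en = {'<ukn>': 0, '<pad>': 1, 'hello': 2, 'world': 3}
toy_de = {'<ukn>': 0, '<go>': 1, '<eos>': 2, '<pad>': 3, 'hallo': 4, 'welt': 5}

toy_x = [2, 3]   # "hello world"
toy_y = [4, 5]   # "hallo welt"

toy_x = toy_x + (15 - len(toy_x)) * [toy_en['<pad>']]
toy_y = [toy_de['<go>']] + toy_y + [toy_de['<eos>']] + (15 - len(toy_y)) * [toy_de['<pad>']]

print(len(toy_x))   # 15 -> input_seq_len
print(len(toy_y))   # 17 -> output_seq_len (<go> + 15 tokens + <eos>)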

### 5. Build the translation model
input_seq_len = 15
output_seq_len = 17
en_vocab_size = len(en_vocab) + 2 # + <pad>, <ukn>
de_vocab_size = len(de_vocab) + 4 # + <pad>, <ukn>, <eos>, <go>

# Placeholders: len(encoder_inputs)=15, len(decoder_inputs)=17, len(targets)=17, len(target_weights)=17
encoder_inputs = [tf.placeholder(dtype = tf.int32, shape = [None], name = 'encoder{}'.format(i)) for i in range(input_seq_len)]
decoder_inputs = [tf.placeholder(dtype = tf.int32, shape = [None], name = 'decoder{}'.format(i)) for i in range(output_seq_len)]
targets = [decoder_inputs[i+1] for i in range(output_seq_len-1)]
targets.append(tf.placeholder(dtype = tf.int32, shape = [None], name = 'last_target'))
target_weights = [tf.placeholder(dtype = tf.float32, shape = [None], name = 'target_w{}'.format(i)) for i in range(output_seq_len)]
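
# Illustration (not part of the model): targets are simply the decoder inputs
# shifted left by one time step, so at step t the model learns to predict the
# decoder token of step t+1. For a hypothetical decoder sequence:
#   decoder inputs:  <go>   hallo  welt   <eos>  <pad> ...
#   targets:         hallo  welt   <eos>  <pad>  ...
# (the final target slot is a separate placeholder, fed with <pad> in feed_dict, step 7)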

# Output projection
size = 512  # LSTM hidden size (dimension of each decoder output)
w_t = tf.get_variable('proj_w', [de_vocab_size, size], tf.float32)  # output projection weights
b = tf.get_variable('proj_b', [de_vocab_size], tf.float32)  # output projection bias
w = tf.transpose(w_t)
output_projection = (w, b)
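
# Shape note: w_t is [de_vocab_size, size] and w = tf.transpose(w_t) is
# [size, de_vocab_size], so projecting a decoder output of shape [batch, size]
# via tf.matmul(output, w) + b yields per-word logits of shape
# [batch, de_vocab_size] (done explicitly as outputs_proj in step 8).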

# Call TensorFlow's built-in embedding_attention_seq2seq directly
outputs, states = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
                                            encoder_inputs, 
                                            decoder_inputs, 
                                            tf.contrib.rnn.BasicLSTMCell(size),
                                            num_encoder_symbols = en_vocab_size,
                                            num_decoder_symbols = de_vocab_size,
                                            embedding_size = 100,
                                            feed_previous = False,
                                            output_projection = output_projection,
                                            dtype = tf.float32)

### 6. Define the loss function
# sampled softmax loss - returns: A batch_size 1-D tensor of per-example sampled softmax losses
def sampled_loss(labels, logits):
    return tf.nn.sampled_softmax_loss(
                        weights = w_t,
                        biases = b,
                        labels = tf.reshape(labels, [-1, 1]),
                        inputs = logits,
                        num_sampled = 512,
                        num_classes = de_vocab_size)

# Weighted cross-entropy loss between the predicted and target sequences
loss = tf.contrib.legacy_seq2seq.sequence_loss(outputs, targets, target_weights, softmax_loss_function = sampled_loss)
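
For intuition, the quantity that sampled_softmax_loss approximates is the full softmax cross-entropy over the entire German vocabulary. The NumPy sketch below (toy sizes and made-up variable names, not part of the original notebook) shows what that full loss would look like for a single decoder time step; sampled_softmax_loss avoids the expensive full-vocabulary projection by scoring only the true class plus num_sampled = 512 randomly drawn negative classes during training.

# Toy NumPy sketch of the full softmax cross-entropy for ONE decoder time step
# (illustration only; the model above uses tf.nn.sampled_softmax_loss instead)
toy_vocab_size = 6                               # stand-in for de_vocab_size
decoder_out = np.random.randn(size)              # one decoder output vector, shape [512]
proj_w = np.random.randn(toy_vocab_size, size)   # plays the role of w_t
proj_b = np.zeros(toy_vocab_size)                # plays the role of b

full_logits = proj_w.dot(decoder_out) + proj_b   # scores for every word in the toy vocab
full_probs = np.exp(full_logits - full_logits.max())
full_probs /= full_probs.sum()

true_word_idx = 4                                # hypothetical correct next word
print('full softmax loss:', -np.log(full_probs[true_word_idx]))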

### 7. Helper functions
# Numerically stable softmax (used when decoding model outputs)
def softmax(x):
    n = np.max(x)
    e_x = np.exp(x - n)
    return e_x / e_x.sum()
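
# Quick check (illustration only): subtracting the max keeps np.exp from
# overflowing on large scores; softmax([1000, 1001, 1002]) == softmax([0, 1, 2]).
print(softmax(np.array([1000.0, 1001.0, 1002.0])))   # ~[0.09, 0.24, 0.67]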

# Build the feed dict for a randomly sampled mini-batch
def feed_dict(x, y, batch_size = 64):
    feed = {}
    idxes = np.random.choice(len(x), size = batch_size, replace = False)
    
    for i in range(input_seq_len):
        feed[encoder_inputs[i].name] = np.array([x[j][i] for j in idxes], dtype = np.int32)
        
    for i in range(output_seq_len):
        feed[decoder_inputs[i].name] = np.array([y[j][i] for j in idxes], dtype = np.int32)
        
    feed[targets[len(targets)-1].name] = np.full(shape = [batch_size], fill_value = de_word2idx['<pad>'], dtype = np.int32)
    
    for i in range(output_seq_len-1):
        batch_weights = np.ones(batch_size, dtype = np.float32)
        target = feed[decoder_inputs[i+1].name]
        for j in range(batch_size):
            if target[j] == de_word2idx['<pad>']:
                batch_weights[j] = 0.0
        feed[target_weights[i].name] = batch_weights
        
    feed[target_weights[output_seq_len-1].name] = np.zeros(batch_size, dtype = np.float32)
    return feed
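
# Optional sanity check (illustration only): every placeholder receives one value
# per example in the mini-batch, and the feed holds
# 15 encoder inputs + 17 decoder inputs + 1 extra target + 17 target weights = 50 entries.
_check_feed = feed_dict(X_train, Y_train, batch_size = 8)
assert all(v.shape == (8,) for v in _check_feed.values())
print('entries in feed:', len(_check_feed))   # 50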

# Decode a sequence of output logits into German words (greedy argmax at each step)
def decode_output(output_seq):
    words = []
    for i in range(output_seq_len):
        smax = softmax(output_seq[i])
        idx = np.argmax(smax)
        words.append(de_idx2word[idx])
    return words

# ops and hyperparameters
learning_rate = 5e-3
batch_size = 64
steps = 10  # Note: the original value was 1000; 10 is used here just to verify quickly that the model runs

### 8. Training and inference ops
# ops for projecting outputs
outputs_proj = [tf.matmul(outputs[i], output_projection[0]) + output_projection[1] for i in range(output_seq_len)]
# training op
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(loss)
# init op
init = tf.global_variables_initializer()
# forward step
def forward_step(sess, feed):
    output_sequences = sess.run(outputs_proj, feed_dict = feed)
    return output_sequences
# training step
def backward_step(sess, feed):
    sess.run(optimizer, feed_dict = feed)

### 9. Train the model and save checkpoints
losses = []
saver = tf.train.Saver()  # for saving model checkpoints
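# note: the checkpoints/ directory should exist before training starts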
print('------------------TRAINING------------------')

with tf.Session() as sess:
    sess.run(init)
    t = time.time()
    for step in range(steps):
        feed = feed_dict(X_train, Y_train)
        backward_step(sess, feed)
        
        if step % 5 == 4 or step == 0:
            loss_value = sess.run(loss, feed_dict = feed)
            print('step: {}, loss: {}'.format(step, loss_value))
            losses.append(loss_value)
        
        if step % 20 == 19:
            saver.save(sess, 'checkpoints/', global_step=step)
            print('Checkpoint is saved')

    print('Training time for {} steps: {}s'.format(steps, time.time() - t))

### 10. Plot the training loss
with plt.style.context('fivethirtyeight'):
    plt.plot(losses, linewidth = 1)
    plt.xlabel('Steps')
    plt.ylabel('Losses')
    plt.ylim((0, 12))
plt.show()

### 11. Test the trained model
with tf.Graph().as_default():
    # placeholders
    encoder_inputs = [tf.placeholder(dtype = tf.int32, shape = [None], name = 'encoder{}'.format(i)) for i in range(input_seq_len)]
    decoder_inputs = [tf.placeholder(dtype = tf.int32, shape = [None], name = 'decoder{}'.format(i)) for i in range(output_seq_len)]
    # output projection
    size = 512
    w_t = tf.get_variable('proj_w', [de_vocab_size, size], tf.float32)
    b = tf.get_variable('proj_b', [de_vocab_size], tf.float32)
    w = tf.transpose(w_t)
    output_projection = (w, b)
    # change the model so that output at time t can be fed as input at time t+1
    outputs, states = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
                                                encoder_inputs,
                                                decoder_inputs,
                                                tf.contrib.rnn.BasicLSTMCell(size),
                                                num_encoder_symbols = en_vocab_size,
                                                num_decoder_symbols = de_vocab_size,
                                                embedding_size = 100,
                                                feed_previous = True, # <-----this is changed----->
                                                output_projection = output_projection,
                                                dtype = tf.float32)
    # ops for projecting outputs
    outputs_proj = [tf.matmul(outputs[i], output_projection[0]) + output_projection[1] for i in range(output_seq_len)]

    # let's translate these sentences     
    en_sentences = ["What' s your name", 'My name is', 'What are you doing', 'I am reading a book',\
                    'How are you', 'I am good', 'Do you speak English', 'What time is it', 'Hi', 'Goodbye', 'Yes', 'No']
    en_sentences_encoded = [[en_word2idx.get(word, 0) for word in en_sentence.split()] for en_sentence in en_sentences]
    
    # padding to fit encoder input
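    # note: the hard-coded 15 below must match input_seq_len used to build the graph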
    for i in range(len(en_sentences_encoded)):
        en_sentences_encoded[i] += (15 - len(en_sentences_encoded[i])) * [en_word2idx['<pad>']]
    
    # restore all variables - use the last checkpoint saved
    saver = tf.train.Saver()
    path = tf.train.latest_checkpoint('checkpoints')
    
    with tf.Session() as sess:
        # restore
        saver.restore(sess, path)
        # feed data into placeholders
        feed = {}
        for i in range(input_seq_len):
            feed[encoder_inputs[i].name] = np.array([en_sentences_encoded[j][i] for j in range(len(en_sentences_encoded))], dtype = np.int32)
            
        feed[decoder_inputs[0].name] = np.array([de_word2idx['<go>']] * len(en_sentences_encoded), dtype = np.int32)
        
        # translate
        output_sequences = sess.run(outputs_proj, feed_dict = feed)
        # decode seq.
        for i in range(len(en_sentences_encoded)):
            print('{}.\n--------------------------------'.format(i+1))
            output_seq = [output_sequences[j][i] for j in range(output_seq_len)]
            # decode output sequence
            words = decode_output(output_seq)

            print(en_sentences[i])
            for word in words:
                if word not in ['<eos>', '<pad>', '<go>']:
                    print(word, end=' ')
            
            print('\n--------------------------------')

Code for the data preprocessing step:

Format of data.en (one sentence per line):

I was a Ph.D. student in clinical psychology at Berkeley. 
She was a 26-year-old woman named Alex. 
Now Alex walked into her first session wearing jeans and a big slouchy top, and she dropped onto the couch in my office and kicked off her flats and told me she was there to talk about guy problems. 
Now when I heard this, I was so relieved. 
My classmate got an arsonist for her first client. 
And I got a twentysomething who wanted to talk about boys. 
This I thought I could handle. 
But I didn't handle it. 
With the funny stories that Alex would bring to session, it was easy for me just to nod my head while we kicked the can down the road.

Format of data.de (one sentence per line):

Als ich in meinen 20ern war, hatte ich meine erste Psychotherapie-Patientin. 
Ich war Doktorandin und studierte Klinische Psychologie in Berkeley. 
Sie war eine 26-jährige Frau namens Alex. 
Als Alex in die erste Sitzung kam, trug sie Jeans und ein ausgebeultes Top. Sie fiel auf das Sofa in meinem Büro, schleuderte ihre Sandalen von sich und erzählte mir, sie wäre da, um über Männerprobleme zu reden. 
Und als ich das hörte, war ich erleichtert. 
Meine Kommilitonin bekam nämlich einen Brandstifter als ersten Patienten. 
Und ich bekam eine Frau in den 20ern, die über Jungs reden wollte. 
Das kriege ich hin, dachte ich mir. 
Aber ich habe es nicht hingekriegt. 
Mit den lustigen Geschichten, die Alex mit in die Sitzung brachte, war es leicht für mich, einfach mit dem Kopf zu nicken, während wir die Probleme vor uns herschoben. 

The following code preprocesses the parallel corpus shown above; Python 3, run in Jupyter.

import pickle
from collections import Counter
from operator import itemgetter

def read_sentences(file_path):
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as reader:
        for s in reader:
            sentences.append(s.strip())

    return sentences

def create_dataset(en_sentences, de_sentences):

    en_vocab_dict = Counter(word.strip(',." ;:)(][?!') for sentence in en_sentences for word in sentence.split())
    de_vocab_dict = Counter(word.strip(',." ;:)(][?!') for sentence in de_sentences for word in sentence.split())

    en_vocab = list(map(lambda x: x[0], sorted(en_vocab_dict.items(), key = lambda x: -x[1])))
    de_vocab = list(map(lambda x: x[0], sorted(de_vocab_dict.items(), key = lambda x: -x[1])))


#   en_vocab = en_vocab[:20000]
#   de_vocab = de_vocab[:30000]

    start_idx = 2
    en_word2idx = dict([(word, idx+start_idx) for idx, word in enumerate(en_vocab)])
    en_word2idx['<ukn>'] = 0
    en_word2idx['<pad>'] = 1

    en_idx2word = dict([(idx, word) for word, idx in en_word2idx.items()])

    start_idx = 4
    de_word2idx = dict([(word, idx+start_idx) for idx, word in enumerate(de_vocab)])
    de_word2idx['<ukn>'] = 0
    de_word2idx['<go>']  = 1
    de_word2idx['<eos>'] = 2
    de_word2idx['<pad>'] = 3

    de_idx2word = dict([(idx, word) for word, idx in de_word2idx.items()])

    x = [[en_word2idx.get(word.strip(',." ;:)(][?!'), 0) for word in sentence.split()] for sentence in en_sentences]
    y = [[de_word2idx.get(word.strip(',." ;:)(][?!'), 0) for word in sentence.split()] for sentence in de_sentences]

    X = []
    Y = []
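    # Keep a pair only when both sides have at most 15 tokens and their lengths
    # differ by no more than 30% of the shorter one; e.g. lengths (10, 12) pass
    # (|10 - 12| = 2 <= 0.3 * 10 = 3), while lengths (5, 9) are dropped (4 > 1.5).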
    for i in range(len(x)):
        n1 = len(x[i])
        n2 = len(y[i])
        n = n1 if n1 < n2 else n2 
        if abs(n1 - n2) <= 0.3 * n:
            if n1 <= 15 and n2 <= 15:
                X.append(x[i])
                Y.append(y[i])

    return X, Y, en_word2idx, en_idx2word, en_vocab, de_word2idx, de_idx2word, de_vocab

def save_dataset(file_path, obj):
    with open(file_path, 'wb') as f:
        pickle.dump(obj, f, -1)

def read_dataset(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

en_sentences = read_sentences('data.en')
de_sentences = read_sentences('data.de')
save_dataset('demo_data.pkl', create_dataset(en_sentences, de_sentences))
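
One thing to watch: the pickle is written as demo_data.pkl here, while step 2 of the training notebook loads data.pkl, so rename the file (or adjust one of the paths) before training. A quick, optional round-trip check on the file just saved:

# Optional round-trip check: the saved tuple must unpack exactly as in step 2 of the notebook
X, Y, en_word2idx, en_idx2word, en_vocab, de_word2idx, de_idx2word, de_vocab = read_dataset('demo_data.pkl')
print(len(X), len(Y))                        # number of sentence pairs kept by the length filter
print([en_idx2word[idx] for idx in X[0]])    # first English sentence, decoded back to words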